diff --git a/.cargo/katex-header.html b/.cargo/katex-header.html new file mode 100644 index 0000000000..5db5bc0b19 --- /dev/null +++ b/.cargo/katex-header.html @@ -0,0 +1,30 @@ + + + + \ No newline at end of file diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index a0ac3ec727..1af066714e 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -10,39 +10,35 @@ on: branches: - "**" -jobs: +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +env: + CARGO_TERM_COLOR: always + +jobs: test: name: Test Suite runs-on: ubuntu-latest + timeout-minutes: 30 if: "! contains(toJSON(github.event.commits.*.message), '[skip-ci]')" steps: - name: Checkout sources - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install nightly toolchain - id: rustc-toolchain - uses: actions-rs/toolchain@v1 + uses: dtolnay/rust-toolchain@master with: - profile: minimal - toolchain: nightly - override: true + toolchain: nightly-2024-02-01 - - name: rust-cache - uses: actions/cache@v3 + - name: Set up rust cache + uses: Swatinem/rust-cache@v2 with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - target/ - key: rustc-test-${{ steps.rustc-toolchain.outputs.rustc_hash }}-cargo-${{ hashFiles('**/Cargo.toml') }} + cache-on-failure: true - name: Check in plonky2 subdirectory - uses: actions-rs/cargo@v1 - with: - command: check - args: --manifest-path plonky2/Cargo.toml + run: cargo check --manifest-path plonky2/Cargo.toml env: RUSTFLAGS: -Copt-level=3 -Cdebug-assertions -Coverflow-checks=y -Cdebuginfo=0 RUST_LOG: 1 @@ -50,10 +46,7 @@ jobs: RUST_BACKTRACE: 1 - name: Check in starky subdirectory - uses: actions-rs/cargo@v1 - with: - command: check - args: --manifest-path starky/Cargo.toml + run: cargo check --manifest-path 
starky/Cargo.toml env: RUSTFLAGS: -Copt-level=3 -Cdebug-assertions -Coverflow-checks=y -Cdebuginfo=0 RUST_LOG: 1 @@ -61,10 +54,7 @@ jobs: RUST_BACKTRACE: 1 - name: Check in evm subdirectory - uses: actions-rs/cargo@v1 - with: - command: check - args: --manifest-path evm/Cargo.toml + run: cargo check --manifest-path evm/Cargo.toml env: RUSTFLAGS: -Copt-level=3 -Cdebug-assertions -Coverflow-checks=y -Cdebuginfo=0 RUST_LOG: 1 @@ -72,10 +62,78 @@ jobs: RUST_BACKTRACE: 1 - name: Run cargo test - uses: actions-rs/cargo@v1 + run: cargo test --workspace + env: + RUSTFLAGS: -Copt-level=3 -Cdebug-assertions -Coverflow-checks=y -Cdebuginfo=0 + RUST_LOG: 1 + CARGO_INCREMENTAL: 1 + RUST_BACKTRACE: 1 + + wasm: + name: Check wasm32 compatibility + runs-on: ubuntu-latest + timeout-minutes: 30 + if: "! contains(toJSON(github.event.commits.*.message), '[skip-ci]')" + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Install nightly toolchain + uses: dtolnay/rust-toolchain@master with: - command: test - args: --workspace + toolchain: nightly-2024-02-01 + targets: wasm32-unknown-unknown + + - name: Set up rust cache + uses: Swatinem/rust-cache@v2 + with: + cache-on-failure: true + + - name: Check in plonky2 subdirectory for wasm targets + run: cargo check --manifest-path plonky2/Cargo.toml --target wasm32-unknown-unknown --no-default-features + env: + RUSTFLAGS: -Copt-level=3 -Cdebug-assertions -Coverflow-checks=y -Cdebuginfo=0 + RUST_LOG: 1 + CARGO_INCREMENTAL: 1 + RUST_BACKTRACE: 1 + + - name: Check in starky subdirectory for wasm targets + run: cargo check --manifest-path starky/Cargo.toml --target wasm32-unknown-unknown --no-default-features + env: + RUSTFLAGS: -Copt-level=3 -Cdebug-assertions -Coverflow-checks=y -Cdebuginfo=0 + RUST_LOG: 1 + CARGO_INCREMENTAL: 1 + RUST_BACKTRACE: 1 + + no_std: + name: Test Suite in no-std + runs-on: ubuntu-latest + timeout-minutes: 30 + if: "! 
contains(toJSON(github.event.commits.*.message), '[skip-ci]')" + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Install nightly toolchain + uses: dtolnay/rust-toolchain@master + with: + toolchain: nightly-2024-02-01 + + - name: Set up rust cache + uses: Swatinem/rust-cache@v2 + with: + cache-on-failure: true + + - name: Run cargo test in plonky2 subdirectory (no-std) + run: cargo test --manifest-path plonky2/Cargo.toml --no-default-features --lib + env: + RUSTFLAGS: -Copt-level=3 -Cdebug-assertions -Coverflow-checks=y -Cdebuginfo=0 + RUST_LOG: 1 + CARGO_INCREMENTAL: 1 + RUST_BACKTRACE: 1 + + - name: Run cargo test in starky subdirectory (no-std) + run: cargo test --manifest-path starky/Cargo.toml --no-default-features --lib env: RUSTFLAGS: -Copt-level=3 -Cdebug-assertions -Coverflow-checks=y -Cdebuginfo=0 RUST_LOG: 1 @@ -85,44 +143,25 @@ jobs: lints: name: Formatting and Clippy runs-on: ubuntu-latest + timeout-minutes: 10 if: "! contains(toJSON(github.event.commits.*.message), '[skip-ci]')" steps: - name: Checkout sources - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install nightly toolchain - id: rustc-toolchain - uses: actions-rs/toolchain@v1 + uses: dtolnay/rust-toolchain@master with: - profile: minimal - toolchain: nightly - override: true + toolchain: nightly-2024-02-01 components: rustfmt, clippy - - name: rust-cache - uses: actions/cache@v3 + - name: Set up rust cache + uses: Swatinem/rust-cache@v2 with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - target/ - key: rustc-lints-${{ steps.rustc-toolchain.outputs.rustc_hash }}-cargo-${{ hashFiles('**/Cargo.toml') }} + cache-on-failure: true - name: Run cargo fmt - uses: actions-rs/cargo@v1 - with: - command: fmt - args: --all -- --check - env: - CARGO_INCREMENTAL: 1 + run: cargo fmt --all --check - name: Run cargo clippy - uses: actions-rs/cargo@v1 - with: - command: clippy - args: --all-features --all-targets -- 
-D warnings -A incomplete-features - env: - # Seems necessary until https://github.com/rust-lang/rust/pull/115819 is merged. - CARGO_INCREMENTAL: 0 + run: cargo clippy --all-features --all-targets -- -D warnings -A incomplete-features diff --git a/README.md b/README.md index ab40a5c918..6ee6b82a00 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # Plonky2 & more +[![Discord](https://img.shields.io/discord/743511677072572486?logo=discord)](https://discord.gg/QZKRUpqCJ6) This repository was originally for Plonky2, a SNARK implementation based on techniques from PLONK and FRI. It has since expanded to include tools such as Starky, a highly performant STARK implementation. diff --git a/audits/Least Authority - Polygon Zero Plonky2 Final Audit Report.pdf b/audits/Least Authority - Polygon Zero Plonky2 Final Audit Report.pdf new file mode 100644 index 0000000000..4df70d30b9 Binary files /dev/null and b/audits/Least Authority - Polygon Zero Plonky2 Final Audit Report.pdf differ diff --git a/evm/.cargo/katex-header.html b/evm/.cargo/katex-header.html new file mode 100644 index 0000000000..20723b5d27 --- /dev/null +++ b/evm/.cargo/katex-header.html @@ -0,0 +1 @@ +../../.cargo/katex-header.html \ No newline at end of file diff --git a/evm/Cargo.toml b/evm/Cargo.toml index e328aa0c97..24c560a0ed 100644 --- a/evm/Cargo.toml +++ b/evm/Cargo.toml @@ -2,6 +2,7 @@ name = "plonky2_evm" description = "Implementation of STARKs for the Ethereum Virtual Machine" version = "0.1.1" +license = "MIT OR Apache-2.0" authors = ["Daniel Lubarov ", "William Borgeaud "] readme = "README.md" repository = "https://github.com/0xPolygonZero/plonky2" @@ -13,7 +14,7 @@ edition = "2021" anyhow = "1.0.40" bytes = "1.4.0" env_logger = "0.10.0" -eth_trie_utils = { git = "https://github.com/0xPolygonZero/eth_trie_utils.git", rev = "e9ec4ec2aa2ae976b7c699ef40c1ffc716d87ed5" } +eth_trie_utils = { git = "https://github.com/0xPolygonZero/eth_trie_utils.git", rev = 
"7fc3c3f54b3cec9c6fc5ffc5230910bd1cb77f76" } ethereum-types = "0.14.0" hex = { version = "0.4.3", optional = true } hex-literal = "0.4.1" @@ -59,3 +60,7 @@ required-features = ["asmtools"] [[bench]] name = "stack_manipulation" harness = false + +# Display math equations properly in documentation +[package.metadata.docs.rs] +rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] diff --git a/evm/LICENSE-APACHE b/evm/LICENSE-APACHE new file mode 100644 index 0000000000..1b5ec8b78e --- /dev/null +++ b/evm/LICENSE-APACHE @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS diff --git a/evm/LICENSE-MIT b/evm/LICENSE-MIT new file mode 100644 index 0000000000..72dc60d84b --- /dev/null +++ b/evm/LICENSE-MIT @@ -0,0 +1,19 @@ +The MIT License (MIT) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/evm/README.md b/evm/README.md new file mode 100644 index 0000000000..a5c201550b --- /dev/null +++ b/evm/README.md @@ -0,0 +1,36 @@ +# Provable Stateless ZK-EVM + +Included here is an implementation of a stateless, recursive ZK-EVM client implemented using Plonky2. It currently supports the full Merkle-Patricia Trie and has all Shanghai opcodes implemented. + +## Performance + +This implementation is able to provide transaction level proofs which are then recursively aggregated into a block proof. This means that proofs for a block can be efficiently distributed across a cluster of computers. As these proofs use Plonky2 they are CPU and Memory bound. 
The ability to scale horizontally across transactions increases the total performance of the system dramatically. End-to-end workflows are currently in progress to support this proving mode against live evm networks. + +Furthermore the implementation itself is highly optimized to provide fast proving times on generally available cloud instances and does not require GPUs or special hardware. + +## Ethereum Compatibility + +The aim of this module is to initially provide full ethereum compatibility. Today, all [EVM tests](https://github.com/0xPolygonZero/evm-tests) for the Shanghai hardfork are implemented. Work is progressing on supporting the upcoming [Cancun](https://github.com/0xPolygonZero/plonky2/labels/cancun) EVM changes. Furthermore, this prover uses the full ethereum state tree and hashing modes. + +## Audits + +Audits for the ZK-EVM will begin on November 27th, 2023. See the [Audit RC1 Milestone](https://github.com/0xPolygonZero/plonky2/milestone/2?closed=1). This README will be updated with the proper branches and hashes when the audit has commenced. + +## Documentation / Specification + +The current specification is located in the [/spec](/spec) directory, with the most currently up-to-date PDF [available here](https://github.com/0xPolygonZero/plonky2/blob/main/evm/spec/zkevm.pdf). Further documentation will be made over the coming months. + +## License +Copyright (c) 2023 PT Services DMCC + +Licensed under either of: +* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) + +at your option. + +The SPDX license identifier for this project is `MIT OR Apache-2.0`. + +### Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. 
diff --git a/evm/spec/bibliography.bib b/evm/spec/bibliography.bib index 41fa56b88d..1d83d297e9 100644 --- a/evm/spec/bibliography.bib +++ b/evm/spec/bibliography.bib @@ -18,3 +18,13 @@ @misc{plonk year = {2019}, note = {\url{https://ia.cr/2019/953}}, } + +@article{yellowpaper, + title={Ethereum: A secure decentralised generalised transaction ledger}, + author={Wood, Gavin and others}, + journal={Ethereum project yellow paper}, + volume={151}, + number={2014}, + pages={1--32}, + year={2014} +} diff --git a/evm/spec/cpulogic.tex b/evm/spec/cpulogic.tex new file mode 100644 index 0000000000..318e2db487 --- /dev/null +++ b/evm/spec/cpulogic.tex @@ -0,0 +1,285 @@ +\section{CPU logic} +\label{cpulogic} + +The CPU is in charge of coordinating the different STARKs, proving the correct execution of the instructions it reads and guaranteeing +that the final state of the EVM corresponds to the starting state after executing the input transaction. All design choices were made +to make sure these properties can be adequately translated into constraints of degree at most 3 while minimizing the size of the different +table traces (number of columns and number of rows). + +In this section, we will detail some of these choices. + +\subsection{Kernel} +The kernel is in charge of the proving logic. This section aims at providing a high level overview of this logic. For details about any specific part of the logic, one can consult the various ``asm'' files in the \href{https://github.com/0xPolygonZero/plonky2/tree/main/evm/src/cpu/kernel}{``kernel'' folder}. + +We prove one transaction at a time. These proofs can later be aggregated recursively to prove a block. Proof aggregation is however not in the scope of this section. Here, we assume that we have an initial state of the EVM, and we wish to prove that a single transaction was correctly executed, leading to a correct update of the state. 
+ +Since we process one transaction at a time, a few intermediary values need to be provided by the prover. Indeed, to prove that the registers in the EVM state are correctly updated, we need to have access to their initial values. When aggregating proofs, we can also constrain those values to match from one transaction to the next. Let us consider the example of the transaction number. Let $n$ be the number of transactions executed so far in the current block. If the current proof is not a dummy one (we are indeed executing a transaction), then the transaction number should be updated: $n := n+1$. Otherwise, the number remains unchanged. We can easily constrain this update. When aggregating the previous transaction proof ($lhs$) with the current one ($rhs$), we also need to check that the output transaction number of $lhs$ is the same as the input transaction number of $rhs$. + +Those prover provided values are stored in memory prior to entering the kernel, and are used in the kernel to assert correct updates. The list of prover provided values necessary to the kernel is the following: +\begin{enumerate} + \item the previous transaction number: $t_n$, + \item the gas used before executing the current transaction: $g\_u_0$, + \item the gas used after executing the current transaction: $g\_u_1$, + \item the state, transaction and receipts MPTs before executing the current transaction: $\texttt{tries}_0$, + \item the hash of all MPTs before executing the current transaction: $\texttt{digests}_0$, + \item the hash of all MPTs after executing the current transaction: $\texttt{digests}_1$, + \item the RLP encoding of the transaction. +\end{enumerate} + +\paragraph*{Initialization:} The first step consists in initializing: +\begin{itemize} + \item The shift table: it maps the number of bit shifts $s$ with its shifted value $1 << s$. Note that $0 \leq s \leq 255$. 
+ \item The initial MPTs: the initial state, transaction and receipt tries $\texttt{tries}_0$ are loaded from memory and hashed. The hashes are then compared to $\texttt{digests}_0$. + \item We load the transaction number $t_n$ and the current gas used $g\_u_0$ from memory. +\end{itemize} + +If no transaction is provided, we can halt after this initialization. Otherwise, we start processing the transaction. The transaction is provided as its RLP encoding. We can deduce the various transaction fields (such as its type or the transfer value) from its encoding. Based on this, the kernel updates the state trie by executing the transaction. Processing the transaction also includes updating the transactions MPT with the transaction at hand. + +The processing of the transaction returns a boolean ``success'' that indicates whether the transaction was executed successfully, along with the leftover gas. + +The following step is then to update the receipts MPT. Here, we update the transaction's bloom filter. We store ``success'', the leftover gas, the transaction bloom filter and the logs in memory. We also store some additional information that facilitates the RLP encoding of the receipts later. + +If there are any withdrawals, they are performed at this stage. + +Finally, once the three MPTs have been updated, we need to carry out final checks: +\begin{itemize} + \item the gas used after the execution is equal to $g\_u_1$, + \item the new transaction number is $n+1$ if there was a transaction, + \item the three MPTs are hashed and checked against $\texttt{digests}_1$. +\end{itemize} +Once those final checks are performed, the program halts. + +\subsection{Simple opcodes VS Syscalls} +For simplicity and efficiency, EVM opcodes are categorized into two groups: ``simple opcodes'' and ``syscalls''. Simple opcodes are generated directly in Rust, in \href{https://github.com/0xPolygonZero/plonky2/blob/main/evm/src/witness/operation.rs}{operation.rs}. 
Every call to a simple opcode adds exactly one row to the \href{https://github.com/0xPolygonZero/plonky2/blob/main/evm/spec/tables/cpu.tex}{cpu table}. Syscalls are more complex structures written with simple opcodes, in the kernel. + +Whenever we encounter a syscall, we switch to kernel mode and execute its associated code. At the end of each syscall, we run EXIT\_KERNEL, which resets the kernel mode to its state right before the syscall. It also sets the PC to point to the opcode right after the syscall. + +Exceptions are handled differently for simple opcodes and syscalls. When necessary, simple opcodes throw an exception (see \ref{exceptions}). This activates the ``exception flag'' in the CPU and runs the exception operations. On the other hand, syscalls handle exceptions in the kernel directly. + +\subsection{Privileged instructions} + +To ease and speed-up proving time, the zkEVM supports custom, privileged instructions that can only be executed by the kernel. +Any appearance of those privileged instructions in a contract bytecode for instance would result in an unprovable state. + +In what follows, we denote by $p_{BN}$ the characteristic of the BN254 curve base field, curve for which Ethereum supports the +ecAdd, ecMul and ecPairing precompiles. + +\begin{enumerate}[align=left] + \item[0x0C.] \texttt{ADDFP254}. Pops 2 elements from the stack interpreted as BN254 base field elements, and pushes their addition modulo $p_{BN}$ onto the stack. + + \item[0x0D.] \texttt{MULFP254}. Pops 2 elements from the stack interpreted as BN254 base field elements, and pushes their product modulo $p_{BN}$ onto the stack. + + \item[0x0E.] \texttt{SUBFP254}. Pops 2 elements from the stack interpreted as BN254 base field elements, and pushes their difference modulo $p_{BN}$ onto the stack. + This instruction behaves similarly to the SUB (0x03) opcode, in that we subtract the second element of the stack from the initial (top) one. + + \item[0x0F.] \texttt{SUBMOD}. 
Pops 3 elements from the stack, and pushes the modular difference of the first two elements of the stack by the third one. + It is similar to the SUB instruction, with an extra pop for the custom modulus. + + \item[0x21.] \texttt{KECCAK\_GENERAL}. Pops 2 elements (a Memory address, followed by a length $\ell$) and pushes the hash of the memory portion starting at the + constructed address and of length $\ell$. It is similar to KECCAK256 (0x20) instruction, but can be applied to any memory section (i.e. even privileged ones). + + \item[0x49.] \texttt{PROVER\_INPUT}. Pushes a single prover input onto the stack. + + \item[0xC0-0xDF.] \texttt{MSTORE\_32BYTES}. Pops 2 elements from the stack (a Memory address, and then a value), and pushes + a new address' onto the stack. The value is being decomposed into bytes and written to memory, starting from the fetched address. The new address being pushed is computed as the + initial address + the length of the byte sequence being written to memory. Note that similarly to PUSH (0x60-0x7F) instructions, there are 32 MSTORE\_32BYTES instructions, each + corresponding to a target byte length (length 0 is ignored, for the same reasons as MLOAD\_32BYTES, see below). Writing to memory an integer fitting in $n$ bytes with a length $\ell < n$ will + result in the integer being truncated. On the other hand, specifying a length $\ell$ greater than the byte size of the value being written will result in padding with zeroes. This + process is heavily used when resetting memory sections (by calling MSTORE\_32BYTES\_32 with the value 0). + + \item[0xF6.] \texttt{GET\_CONTEXT}. Pushes the current context onto the stack. The kernel always has context 0. + + \item[0xF7.] \texttt{SET\_CONTEXT}. Pops the top element of the stack and updates the current context to this value. It is usually used when calling another contract or precompile, + to distinguish the caller from the callee. + + \item[0xF8.] \texttt{MLOAD\_32BYTES}. 
Pops 2 elements from the stack (a Memory address, and then a length $\ell$), and pushes + a value onto the stack. The pushed value corresponds to the U256 integer read from the big-endian sequence of length $\ell$ from the memory address being fetched. Note that an + empty length is not valid, nor is a length greater than 32 (as a U256 consists of at most 32 bytes). Failing to meet these conditions will result in an unverifiable proof. + + \item[0xF9.] \texttt{EXIT\_KERNEL}. Pops 1 element from the stack. This instruction is used at the end of a syscall, before proceeding to the rest of the execution logic. + The popped element, \textit{kexit\_info}, contains several pieces of information like the current program counter, the current amount of gas used, and whether we are in kernel (i.e. privileged) mode or not. + + \item[0xFB.] \texttt{MLOAD\_GENERAL}. Pops 1 element (a Memory address), and pushes the value stored at this memory + address onto the stack. It can read any memory location, general (similarly to MLOAD (0x51) instruction) or privileged. + + \item[0xFC.] \texttt{MSTORE\_GENERAL}. Pops 2 elements (a value and a Memory address), and writes the popped value from + the stack at the fetched address. It can write to any memory location, general (similarly to MSTORE (0x52) / MSTORE8 (0x53) instructions) or privileged. +\end{enumerate} + + +\subsection{Memory addresses} +\label{memoryaddresses} + +Kernel operations deal with memory addresses as single U256 elements. +However, when processing the operations to generate the proof witness, the CPU will decompose these into three components: + +\begin{itemize} + \item[context.] The context of the memory address. The Kernel context is special, and has value 0. + + \item[segment.] The segment of the memory address, corresponding to a specific section given a context (eg. MPT data, global metadata, etc.). + + \item[virtual.] The offset of the memory address, within a segment given a context. 
+\end{itemize} + +To easily retrieve these components, we scale them so that they can represent a memory address as: + +$$ \mathrm{addr} = 2^{64} \cdot \mathrm{context} + 2^{32} \cdot \mathrm{segment} + \mathrm{offset}$$ + +This allows to easily retrieve each component individually once a Memory address has been decomposed into 32-bit limbs. + + +\subsection{Stack handling} +\label{stackhandling} + +\subsubsection{Top of the stack} + +The majority of memory operations involve the stack. The stack is a segment in memory, and stack operations (popping or pushing) use the memory channels. +Every CPU instruction performs between 0 and 3 pops, and may push at most once. However, for efficiency purposes, we hold the top of the stack in +the first memory channel \texttt{current\_row.mem\_channels[0]}, only writing it in memory if necessary. + +\paragraph*{Motivation:} + +See \href{https://github.com/0xPolygonZero/plonky2/issues/1149}{this issue}. + +\paragraph*{Top reading and writing:} + +When a CPU instruction modifies the stack, it must update the top of the stack accordingly. There are three cases. + +\begin{itemize} + \item \textbf{The instruction pops and pushes:} The new top of the stack is stored in \texttt{next\_row.mem\_channels[0]}; it may be computed by the instruction, +or it could be read from memory. In either case, the instruction is responsible for setting \texttt{next\_row.mem\_channels[0]}'s flags and address columns correctly. +After use, the previous top of the stack is discarded and doesn't need to be written in memory. + \item \textbf{The instruction pushes, but doesn't pop:} The new top of the stack is stored in \texttt{next\_row.mem\_channels[0]}; it may be computed by the instruction, +or it could be read from memory. In either case, the instruction is responsible for setting \texttt{next\_row.mem\_channels[0]}'s flags and address columns correctly. 
+If the stack wasn't empty (\texttt{current\_row.stack\_len > 0}), the instruction performs a memory read in \texttt{current\_row.partial\_ channel}. \texttt{current\_row.partial\_channel} +shares its values with \texttt{current\_ row.mem\_channels[0]} (which holds the current top of the stack). If the stack was empty, \texttt{current\_row.partial\_channel} +is disabled. + \item \textbf{The instruction pops, but doesn't push:} After use, the current top of the stack is discarded and doesn't need to be written in memory. +If the stack isn't empty now (\texttt{current\_row.stack\_len > num\_pops}), the new top of the stack is set in \texttt{next\_row.mem\_channels[0]} +with a memory read from the stack segment. If the stack is now empty, \texttt{next\_row.mem\_channels[0]} is disabled. +\end{itemize} + +In the last two cases, there is an edge case if \texttt{current\_row.stack\_len} is equal to a \texttt{special\_len}. For a strictly pushing instruction, +this happens if the stack is empty, and \texttt{special\_len = 0}. For a strictly popping instruction, this happens if the next stack is empty, i.e. if +all remaining elements are popped, and \texttt{special\_len = num\_pops}. Note that we do not need to check for values below \texttt{num\_pops}, since this +would be a stack underflow exception which is handled separately. +The edge case is detected with the compound flag +$$\texttt{1 - not\_special\_len * stack\_inv\_aux,}$$ +where $$\texttt{not\_special\_len = current\_row - special\_len}$$ + + +and \texttt{stack\_inv\_aux} is constrained to be the modular inverse of \texttt{not\_special\_ len} if it's non-zero, or 0 otherwise. The flag is 1 +if \texttt{stack\_len} is equal to \texttt{special\_len}, and 0 otherwise. + +This logic can be found in code in the \texttt{eval\_packed\_one} function of \href{https://github.com/0xPolygonZero/plonky2/blob/main/evm/src/cpu/stack.rs}{stack.rs}. 
+The function multiplies all of the stack constraints with the degree 1 filter associated with the current instruction. + +\paragraph*{Operation flag merging:} + +To reduce the total number of columns, many operation flags are merged together (e.g. \texttt{DUP} and \texttt{SWAP}) and are distinguished with the binary decomposition of their opcodes. +The filter for a merged operation is now of degree 2: for example, \texttt{is\_swap = dup\_swap * opcode\_bits[4]} since the 4th bit is set to 1 for a \texttt{SWAP} and 0 for a \texttt{DUP}. +If the two instructions have different stack behaviors, this can be a problem: \texttt{eval\_packed\_one}'s constraints are already of degree 3 and it can't support degree 2 filters. + +When this happens, stack constraints are defined manually in the operation's dedicated file (e.g. \texttt{dup\_swap.rs}). Implementation details vary case-by-case and can be found in the files. + +\subsubsection{Stack length checking} + +The CPU must make sure that the stack length never goes below zero and, in user mode, never grows beyond the maximum stack size. When this happens, an honest prover should trigger the +corresponding exception. If a malicious prover doesn't trigger the exception, constraints must fail the proof. + +\paragraph*{Stack underflow:} +There is no explicit constraint checking for stack underflow. An underflow happens when the CPU tries to pop the empty stack, which would perform a memory read at virtual address \texttt{-1}. +Such a read cannot succeed: in Memory, the range-check argument requires the gap between two consecutive addresses to be lower than the length of the Memory trace. Since the prime of the Plonky2 field is 64-bit long, +this would require a Memory trace longer than $2^{32}$. + +\paragraph*{Stack overflow:} +An instruction can only push at most once, meaning that an overflow occurs whenever the stack length is exactly one more than the maximum stack size ($1024+1$) in user mode. 
+To constrain this, the column \texttt{stack\_len\_bounds\_aux} contains: + +\begin{itemize} + \item[--] the modular inverse of \texttt{stack\_len - 1025} if we're in user mode and \texttt{stack\_len $\neq$ 1025}, + \item[--] 0 if \texttt{stack\_len = 1025} or if we're in kernel mode. +\end{itemize} +Then overflow can be checked with the flag +$$\texttt{(1 - is\_kernel\_mode) - stack\_len * stack\_len\_bounds\_aux}.$$ +The flag is 1 if \texttt{stack\_len = 1025} and we're in user mode, and 0 otherwise. + +Because \texttt{stack\_len\_bounds\_aux} is a shared general column, we only check this constraint after an instruction that can actually trigger an overflow, +i.e. a pushing, non-popping instruction. + +\subsection{Gas handling} + +\subsubsection{Out of gas errors} + +The CPU table has a ``gas'' register that keeps track of the gas used by the transaction so far. + +The crucial invariant in our out-of-gas checking method is that at any point in the program's execution, we have not used more gas than we have available; that is ``gas'' is at most the gas allocation for the transaction (which is stored separately by the kernel). We assume that the gas allocation will never be $2^{32}$ or more, so if ``gas'' does not fit in one limb, then we've run out of gas. + +When a native instruction (one that is not a syscall) is executed, a constraint ensures that the ``gas'' register is increased by the correct amount. This is not automatic for syscalls; the syscall handler itself must calculate and charge the appropriate amount. + +If everything goes smoothly and we have not run out of gas, ``gas'' should be no more than the gas allowance at the point that we STOP, REVERT, stack overflow, or whatever. Indeed, because we assume that the gas overflow handler is invoked \textit{as soon as} we've run out of gas, all these termination methods verify that $\texttt{gas} \leq \texttt{allowance}$, and jump to \texttt{exc\_out\_of\_gas} if this is not the case. 
This is also true for the out-of-gas handler, which checks that: +\begin{enumerate} + \item we have not yet run out of gas + \item we are about to run out of gas +\end{enumerate} +and ``PANIC'' if either of those statements does not hold. + +When we do run out of gas, however, this event must be handled. Syscalls are responsible for checking that their execution would not cause the transaction to run out of gas. If the syscall detects that it would need to charge more gas than available, it aborts the transaction (or the current code) by jumping to \texttt{fault\_exception}. In fact, \texttt{fault\_exception} is in charge of handling all exceptional halts in the kernel. + +Native instructions do this differently. If the prover notices that execution of the instruction would cause an out-of-gas error, it must jump to the appropriate handler instead of executing the instruction. (The handler contains special code that PANICs if the prover invoked it incorrectly.) + +\subsubsection{Overflow} + +We must be careful to ensure that ``gas'' does not overflow to prevent denial of service attacks. + +Note that a syscall cannot be the instruction that causes an overflow. This is because every syscall is required to verify that its execution does not cause us to exceed the gas limit. Upon entry into a syscall, a constraint verifies that $\texttt{gas} < 2^{32}$. Some syscalls may have to be careful to ensure that the gas check is performed correctly (for example, that overflow modulo $2^{256}$ does not occur). So we can assume that upon entry and exit out of a syscall, $\texttt{gas} < 2^{32}$. + +Similarly, native instructions alone cannot cause wraparound. The most expensive instruction, JUMPI, costs 10 gas. Even if we were to execute $2^{32}$ consecutive JUMPI instructions, the maximum length of a trace, we are nowhere close to consuming $2^{64} - 2^{32} + 1$ (= Goldilocks prime) gas. 
+ +The final scenario we must tackle is an expensive syscall followed by many expensive native instructions. Upon exit from a syscall, $\texttt{gas} < 2^{32}$. Again, even if that syscall is followed by $2^{32}$ native instructions of cost 10, we do not see wraparound modulo Goldilocks. + + +\subsection{Exceptions} +\label{exceptions} + +Sometimes, when executing user code (i.e. contract or transaction code), the EVM halts exceptionally (i.e. outside of a STOP, a RETURN or a REVERT). +When this happens, the CPU table invokes a special instruction with a dedicated operation flag \texttt{exception}. +Exceptions can only happen in user mode; triggering an exception in kernel mode would make the proof unverifiable. +No matter the exception, the handling is the same: + +-- The opcode which would trigger the exception is not executed. The operation flag set is \texttt{exception} instead of the opcode's flag. + +-- We push a value to the stack which contains: the current program counter (to retrieve the faulty opcode), and the current value of \texttt{gas\_used}. +The program counter is then set to the corresponding exception handler in the kernel (e.g. \texttt{exc\_out\_of\_gas}). + +-- The exception handler verifies that the given exception would indeed be triggered by the faulty opcode. If this is not the case (if the exception has already happened or if it doesn't happen after executing +the faulty opcode), then the kernel panics: there was an issue during witness generation. + +-- The kernel consumes the remaining gas and returns from the current context with \texttt{success} set to 0 to indicate an execution failure. + +Here is the list of the possible exceptions: + +\begin{enumerate}[align=left] + \item[\textbf{Out of gas:}] Raised when a native instruction (i.e. not a syscall) in user mode pushes the amount of gas used over the current gas limit. +When this happens, the EVM jumps to \texttt{exc\_out\_of\_gas}. 
The kernel then checks that the consumed gas is currently below the gas limit, +and that adding the gas cost of the faulty instruction pushes it over it. +If the exception is not raised, the prover will panic when returning from the execution: the remaining gas is checked to be positive after STOP, RETURN or REVERT. + \item[\textbf{Invalid opcode:}] Raised when the read opcode is invalid. It means either that it doesn't exist, or that it's a privileged instruction and +thus not available in user mode. When this happens, the EVM jumps to \texttt{exc\_invalid\_opcode}. The kernel then checks that the given opcode is indeed invalid. +If the exception is not raised, decoding constraints ensure no operation flag is set to 1, which would make it a padding row. Halting constraints would then make the proof +unverifiable. + \item[\textbf{Stack underflow:}] Raised when an instruction which pops from the stack is called when the stack doesn't have enough elements. +When this happens, the EVM jumps to \texttt{exc\_stack\_overflow}. The kernel then checks that the current stack length is smaller than the minimum +stack length required by the faulty opcode. +If the exception is not raised, the popping memory operation's address offset would underflow, and the Memory range check would require the Memory trace to be too +large ($>2^{32}$). + \item[\textbf{Invalid JUMP destination:}] Raised when the program counter jumps to an invalid location (i.e. not a JUMPDEST). When this happens, the EVM jumps to +\texttt{exc\_invalid\_jump\_destination}. The kernel then checks that the opcode is a JUMP, and that the destination is not a JUMPDEST by checking the +JUMPDEST segment. +If the exception is not raised, jumping constraints will fail the proof. + \item[\textbf{Invalid JUMPI destination:}] Same as the above, for JUMPI. + \item[\textbf{Stack overflow:}] Raised when a pushing instruction in user mode pushes the stack over 1024. 
When this happens, the EVM jumps +to \texttt{exc\_stack\_overflow}. The kernel then checks that the current stack length is exactly equal to 1024 (since an instruction can only +push once at most), and that the faulty instruction is pushing. +If the exception is not raised, stack constraints ensure that a stack length of 1025 in user mode will fail the proof. +\end{enumerate} diff --git a/evm/spec/framework.tex b/evm/spec/framework.tex index d99a31bbee..c20e46db67 100644 --- a/evm/spec/framework.tex +++ b/evm/spec/framework.tex @@ -30,8 +30,130 @@ \subsection{Field selection} At this point we have reduced $n$ to a \texttt{u64}. This partial reduction is adequate for most purposes, but if we needed the result in canonical form, we would perform a final conditional subtraction. - \subsection{Cross-table lookups} \label{ctl} +The various STARK tables carry out independent operations, but on shared values. We need to check that the shared values are identical in all the STARKs that require them. This is where cross-table lookups (CTLs) come in handy. + +Suppose STARK $S_1$ requires an operation -- say $Op$ -- that is carried out by another STARK $S_2$. Then $S_1$ writes the input and output of $Op$ in its own table, and provides the inputs to $S_2$. $S_2$ also writes the inputs and outputs in its rows, and the table's constraints check that $Op$ is carried out correctly. We then need to ensure that the inputs and outputs are the same in $S_1$ and $S_2$. + +In other words, we need to ensure that the rows -- reduced to the input and output columns -- of $S_1$ calling $Op$ are permutations of the rows of $S_2$ that carry out $Op$. Our CTL protocol is based on logUp and is similar to our range-checks. + +To prove this, the first step is to only select the rows of interest in $S_1$ and $S_2$, and filter out the rest. Let $f^1$ be the filter for $S_1$ and $f^2$ the filter for $S_2$. $f^1$ and $f^2$ are constrained to be in $\{0, 1\}$. $f^1 = 1$ (resp. 
$f^2 = 1$) whenever the row at hand carries out $Op$ in $S_1$ (resp. in $S_2$), and 0 otherwise. Let also $(\alpha, \beta)$ be two random challenges. + +The idea is to create subtables $S_1'$ and $S_2'$ of $S_1$ and $S_2$ respectively, such that $f^1 = 1$ and $f^2 = 1$ for all their rows. The columns in the subtables are limited to the ones whose values must be identical (the inputs and outputs of $Op$ in our example). + +Note that for design and constraint reasons, filters are limited to (at most) degree 2 combinations of columns. + +Let $\{c^{1, i}\}_{i=1}^m$ be the columns in $S_1'$ an $\{c^{2,i}\}_{i=1}^m$ be the columns in $S_2'$. + +The prover defines a ``running sum'' $Z$ for $S_1'$ such that: +\begin{gather*} + Z^{S_1}_{n-1} = \frac{1}{\sum_{j=0}^{m-1} \alpha^j \cdot c^{1, j}_{n-1} + \beta} \\ + Z^{S_1}_{i+1} = Z^{S_1}_i + f^1_i \cdot \frac{1}{\sum_{j=0}^{m-1} \alpha^j \cdot c^{1, j}_i + \beta} +\end{gather*} +The second equation ``selects'' the terms of interest thanks to $f^1$ and filters out the rest. + +Similarly, the prover constructs a running sum $Z^{S_2}$for $S_2$. Note that $Z$ is computed ``upside down'': we start with $Z_{n-1}$ and the final sum is in $Z_0$. + +On top of the constraints to check that the running sums were correctly constructed, the verifier checks that $Z^{S_1}_0 = Z^{S_2}_0$. +This ensures that the columns in $S_1'$ and the columns in $S_2'$ are permutations of each other. + +In other words, the CTL argument is a logUp lookup argument where $S_1'$ is the looking table, $S_2'$ is the looked table, and $S_1' = S_2'$ (all the multiplicities are 1). +For more details about logUp, see the next section. 
+ +To sum up, for each STARK $S$, the prover: +\begin{enumerate} + \item constructs a running sum $Z_i^l$ for each table looking into $S$ (called looking sums here), + \item constructs a running sum $Z^S$ for $S$ (called looked sum here), + \item sends the final value for each running sum $Z_{i, 0}^l$ and $Z^S_0$ to the verifier, + \item sends a commitment to $Z_i^l$ and $Z^S$ to the verifier. +\end{enumerate} +Then, for each STARK $S$, the verifier: +\begin{enumerate} + \item computes the sum $Z = \sum_i Z_{i, 0}^l$, + \item checks that $Z = Z^S_0$, + \item checks that each $Z_i^l$ and $Z^S$ was correctly constructed. +\end{enumerate} + + +\subsection{Range-checks} +\label{rc} +In most cases, tables deal with U256 words, split into 32-bit limbs (to avoid overflowing the field). To prevent a malicious prover from cheating, it is crucial to range-check those limbs. +\subsubsection{What to range-check?} +One can note that every element that ever appears on the stack has been pushed. Therefore, enforcing a range-check on pushed elements is enough to range-check all elements on the stack. Similarly, all elements in memory must have been written prior, and therefore it is enough to range-check memory writes. However, range-checking the PUSH and MSTORE opcodes is not sufficient. +\begin{enumerate} + \item Pushes and memory writes for ``MSTORE\_32BYTES'' are range-checked in ``BytePackingStark''. + \item Syscalls, exceptions and prover inputs are range-checked in ``ArithmeticStark''. + \item The inputs and outputs of binary and ternary arithmetic operations are range-checked in ``ArithmeticStark''. + \item The inputs' bits of logic operations are checked to be either 1 or 0 in ``LogicStark''. Since ``LogicStark'' only deals with bitwise operations, this is enough to have range-checked outputs as well. + \item The inputs of Keccak operations are range-checked in ``KeccakStark''. The output digest is written as bytes in ``KeccakStark''. 
Those bytes are used to reconstruct the associated 32-bit limbs checked against the limbs in ``CpuStark''. This implicitly ensures that the output is range-checked. +\end{enumerate} +Note that some operations do not require a range-check: +\begin{enumerate} + \item ``MSTORE\_GENERAL'' read the value to write from the stack. Thus, the written value was already range-checked by a previous push. + \item ``EQ'' reads two -- already range-checked -- elements on the stack, and checks they are equal. The output is either 0 or 1, and does therefore not need to be checked. + \item ``NOT'' reads one -- already range-checked -- element. The result is constrained to be equal to $\texttt{0xFFFFFFFF} - \texttt{input}$, which implicitly enforces the range check. + \item ``PC'': the program counter cannot be greater than $2^{32}$ in user mode. Indeed, the user code cannot be longer than $2^{32}$, and jumps are constrained to be JUMPDESTs. Moreover, in kernel mode, every jump is towards a location within the kernel, and the kernel code is smaller than $2^{32}$. These two points implicitly enforce $PC$'s range check. + \item ``GET\_CONTEXT'', ``DUP'' and ``SWAP'' all read and push values that were already written in memory. The pushed values were therefore already range-checked. +\end{enumerate} +Range-checks are performed on the range $[0, 2^{16} - 1]$, to limit the trace length. + +\subsubsection{Lookup Argument} +To enforce the range-checks, we leverage \href{https://eprint.iacr.org/2022/1530.pdf}{logUp}, a lookup argument by Ulrich Häbock. Given a looking table $s = (s_1, ..., s_n)$ and a looked table $t = (t_1, ..., t_m)$, the goal is to prove that +$$\forall 1 \leq i \leq n, \exists 1 \leq j \leq r \texttt{ such that } s_i = t_j$$ +In our case, $t = (0, .., 2^{16} - 1)$ and $s$ is composed of all the columns in each STARK that must be range-checked. 
+ +The logUp paper explains that proving the previous assertion is actually equivalent to proving that there exists a sequence $l$ such that: +$$ \sum_{i=1}^n \frac{1}{X - s_i} = \sum_{j=1}^r \frac{l_j}{X-t_j}$$ + +The values of $s$ can be stored in $c$ different columns of length $n$ each. In that case, the equality becomes: +$$\sum_{k=1}^c \sum_{i=1}^n \frac{1}{X - s_i^k} = \sum_{j=1}^r \frac{l_j}{X-t_j}$$ + +The `multiplicity' $m_i$ of value $t_i$ is defined as the number of times $t_i$ appears in $s$. In other words: +$$m_i = |s_j \in s; s_j = t_i|$$ + +Multiplicities provide a valid sequence of values in the previously stated equation. Thus, if we store the multiplicities, and are provided with a challenge $\alpha$, we can prove the lookup argument by ensuring: +$$\sum_{k=1}^c \sum_{i=1}^n \frac{1}{\alpha - s_i^k} = \sum_{j=1}^r \frac{m_j}{\alpha-t_j}$$ +However, the equation is too high degree. To circumvent this issue, Häbock suggests providing helper columns $h_i$ and $d$ such that at a given row $i$: +\begin{gather*} + h_i^k = \frac{1}{\alpha + s_i^k } \forall 1 \leq k \leq c \\ + d_i = \frac{1}{\alpha + t_i} +\end{gather*} + +The $h$ helper columns can be batched together to save columns. We can batch at most $\texttt{constraint\_degree} - 1$ helper functions together. In our case, we batch them 2 by 2. At row $i$, we now have: +\begin{align*} + h_i^k = \frac{1}{\alpha + s_i^{2k}} + \frac{1}{\alpha + s_i^{2k+1}} \forall 1 \leq k \leq c/2 \\ +\end{align*} +If $c$ is odd, then we have one extra helper column: +$$h_i^{c/2+1} = \frac{1}{\alpha + s_i^{c}}$$ + +For clarity, we will assume that $c$ is even in what follows. + +Let $g$ be a generator of a subgroup of order $n$. We extrapolate $h, m$ and $d$ to get polynomials such that, for $f \in \{h^k, m, g\}$: $f(g^i) = f_i$. 
+We can define the following polynomial: +$$ Z(x) := \sum_{i=1}^n \big[\sum_{k=1}^{c/2} h^k(x) - m(x) * d(x)\big]$$ + + +\subsubsection{Constraints} +With these definitions and a challenge $\alpha$, we can finally check that the assertion holds with the following constraints: +\begin{gather*} + Z(1) = 0 \\ + Z(g \alpha) = Z(\alpha) + \sum_{k=1}^{c/2} h^k(\alpha) - m(\alpha) d(\alpha) +\end{gather*} +These ensure that +We also need to ensure that $h^k$ is well constructed for all $1 \leq k \leq c/2$: +$$ + h(\alpha)^k \cdot (\alpha + s_{2k}) \cdot (\alpha + s_{2k+1}) = (\alpha + s_{2k}) + (\alpha + s_{2k+1}) +$$ + +Note: if $c$ is odd, we have one unbatched helper column $h^{c/2+1}$ for which we need a last constraint: +$$ + h(\alpha)^{c/2+1} \cdot (\alpha + s_{c}) = 1 +$$ -TODO +Finally, the verifier needs to ensure that the table $t$ was also correctly computed. In each STARK, $t$ is computed starting from 0 and adding at most 1 at each row. This construction is constrained as follows: +\begin{enumerate} + \item $t(1) = 0$ + \item $(t(g^{i+1}) - t(g^{i})) \cdot ((t(g^{i+1}) - t(g^{i})) - 1) = 0$ + \item $t(g^{n-1}) = 2^{16} - 1$ +\end{enumerate} diff --git a/evm/spec/instructions.tex b/evm/spec/instructions.tex deleted file mode 100644 index ea09698271..0000000000 --- a/evm/spec/instructions.tex +++ /dev/null @@ -1,8 +0,0 @@ -\section{Privileged instructions} -\label{privileged-instructions} - -\begin{enumerate} - \item[0xFB.] \texttt{MLOAD\_GENERAL}. Returns - \item[0xFC.] \texttt{MSTORE\_GENERAL}. Returns - \item[TODO.] \texttt{STACK\_SIZE}. Returns -\end{enumerate} diff --git a/evm/spec/mpts.tex b/evm/spec/mpts.tex index 49d1d32863..3f6733a535 100644 --- a/evm/spec/mpts.tex +++ b/evm/spec/mpts.tex @@ -1,9 +1,16 @@ -\section{Merkle Patricia tries} +\section{Merkle Patricia Tries} \label{tries} +The \emph{EVM World state} is a representation of the different accounts at a particular time, as well as the last processed transactions together with their receipts. 
The world state is represented using \emph{Merkle Patricia Tries} (MPTs) \cite[App.~D]{yellowpaper}, and there are three different tries: the state trie, the transaction trie and the receipt trie. + +For each transaction we need to show that the prover knows preimages of the hashed initial and final EVM states. When the kernel starts execution, it stores these three tries within the {\tt Segment::TrieData} segment. The prover loads the initial tries from the inputs into memory. Subsequently, the tries are modified during transaction execution, inserting new nodes or deleting existing nodes. + +An MPT is composed of five different nodes: branch, extension, leaf, empty and digest nodes. Branch and leaf nodes might contain a payload whose format depends on the particular trie. The nodes are encoded, primarily using RLP encoding and Hex-prefix encoding (see \cite{yellowpaper} App. B and C, respectively). The resulting encoding is then hashed, following a strategy similar to that of normal Merkle trees, to generate the trie hashes. + +Insertion and deletion is performed in the same way as other MPTs implementations. The only difference is for inserting extension nodes where we create a new node with the new data, instead of modifying the existing one. In the rest of this section we describe how the MPTs are represented in memory, how they are given as input, and how MPTs are hashed. \subsection{Internal memory format} -Withour our zkEVM's kernel memory, +The tries are stored in kernel memory, specifically in the {\tt Segment:TrieData} segment. Each node type is stored as \begin{enumerate} \item An empty node is encoded as $(\texttt{MPT\_NODE\_EMPTY})$. \item A branch node is encoded as $(\texttt{MPT\_NODE\_BRANCH}, c_1, \dots, c_{16}, v)$, where each $c_i$ is a pointer to a child node, and $v$ is a pointer to a value. If a branch node has no associated value, then $v = 0$, i.e. the null pointer. 
@@ -12,15 +19,76 @@ \subsection{Internal memory format} \item A digest node is encoded as $(\texttt{MPT\_NODE\_HASH}, d)$, where $d$ is a Keccak256 digest. \end{enumerate} +On the other hand the values or payloads are represented differently depending on the particular trie. + +\subsubsection{State trie} +The state trie payload contains the account data. Each account is stored in 4 contiguous memory addresses containing +\begin{enumerate} + \item the nonce, + \item the balance, + \item a pointer to the account's storage trie, + \item a hash of the account's code. +\end{enumerate} +The storage trie payload in turn is a single word. + +\subsubsection{Transaction Trie} +The transaction trie nodes contain the length of the RLP encoded transaction, followed by the bytes of the RLP encoding of the transaction. + +\subsubsection{Receipt Trie} +The payload of the receipts trie is a receipt. Each receipt is stored as +\begin{enumerate} + \item the length in words of the payload, + \item the status, + \item the cumulative gas used, + \item the bloom filter, stored as 256 words. + \item the number of topics, + \item the topics + \item the data length, + \item the data. +\end{enumerate} + \subsection{Prover input format} The initial state of each trie is given by the prover as a nondeterministic input tape. This tape has a slightly different format: \begin{enumerate} \item An empty node is encoded as $(\texttt{MPT\_NODE\_EMPTY})$. - \item A branch node is encoded as $(\texttt{MPT\_NODE\_BRANCH}, v_?, c_1, \dots, c_{16})$. Here $v_?$ consists of a flag indicating whether a value is present,\todo{In the current implementation, we use a length prefix rather than a is-present prefix, but we plan to change that.} followed by the actual value payload if one is present. Each $c_i$ is the encoding of a child node. 
- \item An extension node is encoded as $(\texttt{MPT\_NODE\_EXTENSION}, k, c)$, $k$ represents the part of the key associated with this extension, and is encoded as a 2-tuple $(\texttt{packed\_nibbles}, \texttt{num\_nibbles})$. $c$ is a pointer to a child node. + \item A branch node is encoded as $(\texttt{MPT\_NODE\_BRANCH}, v_?, c_1, \dots, c_{16})$. Here $v_?$ consists of a flag indicating whether a value is present, followed by the actual value payload if one is present. Each $c_i$ is the encoding of a child node. + \item An extension node is encoded as $(\texttt{MPT\_NODE\_EXTENSION}, k, c)$, where $k$ represents the part of the key associated with this extension, and is encoded as a 2-tuple $(\texttt{packed\_nibbles}, \texttt{num\_nibbles})$. $c$ is a pointer to a child node. \item A leaf node is encoded as $(\texttt{MPT\_NODE\_LEAF}, k, v)$, where $k$ is a 2-tuple as above, and $v$ is a value payload. \item A digest node is encoded as $(\texttt{MPT\_NODE\_HASH}, d)$, where $d$ is a Keccak256 digest. \end{enumerate} Nodes are thus given in depth-first order, enabling natural recursive methods for encoding and decoding this format. +The payload of state and receipt tries is given in the natural sequential way. The transaction and receipt payloads contain variable size data, thus the input is slightly different. The prover input for the transactions is the transaction RLP encoding preceded by its length. For the receipts, the input is given in the natural sequential way, except that topics and data are preceded by their respective lengths. + +\subsection{Encoding and Hashing} + +Encoding is done recursively starting from the trie root.
Leaf, branch and extension nodes are encoded as the RLP encoding of a list containing the hex prefix encoding of the node key as well as + +\begin{description} + \item[Leaf Node:] the encoding of the payload, + \item[Branch Node:] the hash or encoding of the 16 children and the encoding of the payload, + \item[Extension Node:] the hash or encoding of the child and the encoding of the payload. +\end{description} +For the rest of the nodes we have: +\begin{description} + \item[Empty Node:] the encoding of an empty node is {\tt 0x80}, + \item[Digest Node:] the encoding of a digest node stored as $({\tt MPT\_HASH\_NODE}, d)$ is $d$. +\end{description} + +The payloads in turn are RLP encoded as follows: +\begin{description} + \item[State Trie:] Encoded as a list containing nonce, balance, storage trie hash and code hash. + \item[Storage Trie:] The RLP encoding of the value (thus the double RLP encoding) + \item[Transaction Trie:] The RLP encoded transaction. + \item[Receipt Trie:] Depending on the transaction type it's encoded as ${\sf RLP}({\sf RLP}({\tt receipt}))$ for Legacy transactions or ${\sf RLP}({\tt txn\_type}||{\sf RLP}({\tt receipt}))$ for transactions of type 1 or 2. Each receipt is encoded as a list containing: + \begin{enumerate} + \item the status, + \item the cumulative gas used, + \item the bloom filter, stored as a list of length 256. + \item the list of topics + \item the data string. + \end{enumerate} +\end{description} + +Once a node is encoded it is written to the {\tt Segment::RlpRaw} segment as a sequence of bytes. Then the RLP encoded data is hashed if the length of the data is more than 32 bytes. Otherwise we return the encoding. Further details can be found in the \href{https://github.com/0xPolygonZero/plonky2/tree/main/evm/src/cpu/mpt/hash}{mpt hash folder}.
\ No newline at end of file diff --git a/evm/spec/tables.tex b/evm/spec/tables.tex index 92ee1d2a54..43b45eb584 100644 --- a/evm/spec/tables.tex +++ b/evm/spec/tables.tex @@ -3,6 +3,7 @@ \section{Tables} \input{tables/cpu} \input{tables/arithmetic} +\input{tables/byte-packing} \input{tables/logic} \input{tables/memory} \input{tables/keccak-f} diff --git a/evm/spec/tables/arithmetic.tex b/evm/spec/tables/arithmetic.tex index eafed3ba96..19be4638f6 100644 --- a/evm/spec/tables/arithmetic.tex +++ b/evm/spec/tables/arithmetic.tex @@ -1,4 +1,54 @@ \subsection{Arithmetic} \label{arithmetic} -TODO +Each row of the arithmetic table corresponds to a binary or ternary arithmetic operation. Each of these operations has an associated flag $f_{op}$ in the table, such that $f_{\texttt{op}} = 1$ whenever the operation is $\texttt{op}$ and 0 otherwise. The full list of operations carried out by the table is as follows: +\paragraph*{Binary operations:} \begin{itemize} + \item basic operations: ``add'', ``mul'', ``sub'' and ``div'', + \item comparisons: ``lt'' and ``gt'', + \item shifts: ``shr'' and ``shl'', + \item ``byte'': given $x_1, x_2$, returns the $x_1$-th ``byte'' in $x_2$, + \item modular operations: ``mod'', ``AddFp254'', ``MulFp254'' and ``SubFp254'', + \item range-check: no operation is performed, as this is only used to range-check the input and output limbs in the range [$0, 2^{16} - 1$]. + \end{itemize} +For ``mod'', the second input is the modulus. ``AddFp254'', ``MulFp254'' and ``SubFp254'' are modular operations modulo ``Fp254'' -- the prime for the BN curve's base field. + +\paragraph*{Ternary operations:} There are three ternary operations: modular addition ``AddMod'', modular multiplication ``MulMod'' and modular subtraction ``SubMod''. + +Besides the flags, the arithmetic table needs to store the inputs, output and some auxiliary values necessary for the constraints. The input and output values are range-checked to ensure their canonical representation.
Inputs are 256-bit words. To avoid having too large a range-check, inputs are therefore split into sixteen 16-bit limbs, and range-checked in the range $[0, 2^{16}-1]$. + +Overall, the table comprises the following columns: +\begin{itemize} + \item 17 columns for the operation flags $f_{op}$, + \item 1 column $op$ containing the opcode, + \item 16 columns for the 16-bit limbs $x_{0, i}$ of the first input $x_{0}$, + \item 16 columns for the 16-bit limbs $x_{1, i}$ of the second input $x_{1}$, + \item 16 columns for the 16-bit limbs $x_{2, i}$ of the third input $x_{2}$, + \item 16 columns for the 16-bit limbs $r_i$ of the output $r$, + \item 32 columns for auxiliary values $\texttt{aux}_i$, + \item 1 column $\texttt{range\_counter}$ containing values in the range [$0, 2^{16}-1$], for the range-check, + \item 1 column storing the frequency of appearance of each value in the range $[0, 2^{16} - 1]$. +\end{itemize} + +\paragraph{Note on $op$:} The opcode column is only used for range-checks. For optimization purposes, we check all arithmetic operations against the cpu table together. To ensure correctness, we also check that the operation's opcode corresponds to its behavior. But range-check is not associated with a unique operation: any operation in the cpu table might require its values to be checked. Thus, the arithmetic table cannot know its opcode in advance: it needs to store the value provided by the cpu table. + +\subsubsection{Auxiliary columns} +The way auxiliary values are leveraged to efficiently check correctness is not trivial, but it is explained in detail in each dedicated file. Overall, five files explain the implementations of the various checks. Refer to: +\begin{enumerate} + \item ``mul.rs'' for details on multiplications. + \item ``addcy.rs'' for details on addition, subtraction, ``lt'' and ``gt''. + \item ``modular.rs'' for details on how modular operations are checked.
Note that even though ``div'' and ``mod'' are generated and checked in a separate file, they leverage the logic for modular operations described in ``modular.rs''. + \item ``byte.rs'' for details on how ``byte'' is checked. + \item ``shift.rs'' for details on how shifts are checked. +\end{enumerate} + +\paragraph*{Note on ``lt'' and ``gt'':} For ``lt'' and ``gt'', auxiliary columns hold the difference $d$ between the two inputs $x_1, x_2$. We can then treat them similarly to subtractions by ensuring that $x_1 - x_2 = d$ for ``lt'' and $x_2 - x_1 = d$ for ``gt''. An auxiliary column $cy$ is used for the carry in additions and subtractions. In the comparisons case, it holds the overflow flag. Contrary to subtractions, the output of ``lt'' and ``gt'' operations is not $d$ but $cy$. + +\paragraph*{Note on ``div'':} It might be unclear why ``div'' and ``mod'' are dealt with in the same file. + +Given numerator and denominator $x_1, x_2$, we compute, like for other modular operations, the quotient $q$ and remainder $\texttt{rem}$ such that: +$$x_1 = q * x_2 + \texttt{rem}.$$ +We then set the associated auxiliary columns to $\texttt{rem}$ and the output to $q$. + +This is why ``div'' is essentially a modulo operation, and can be addressed in almost the same way as ``mod''. The only difference is that in the ``mod'' case, the output is $\texttt{rem}$ and the auxiliary value is $q$. + +\paragraph{Note on shifts:} ``shr'' and ``shl'' are internally constrained as ``div'' and ``mul'' respectively with shifted operands. Indeed, given inputs $s, x$, the output should be $x >> s$ for ``shr'' (resp. $x << s$ for ``shl''). Since shifts are binary operations, we can use the third input columns to store $s_{\texttt{shifted}} = 1 << s$. Then, we can use the ``div'' logic (resp. ``mul'' logic) to ensure that the output is $\frac{x}{s_{\texttt{shifted}}}$ (resp. $x * s_{\texttt{shifted}}$).
\ No newline at end of file diff --git a/evm/spec/tables/byte-packing.tex b/evm/spec/tables/byte-packing.tex new file mode 100644 index 0000000000..6305b7226b --- /dev/null +++ b/evm/spec/tables/byte-packing.tex @@ -0,0 +1,59 @@ +\subsection{Byte Packing} +\label{byte-packing} + +The BytePacking STARK module is used for reading and writing non-empty byte sequences of length at most 32 to memory. +The "packing" term highlights that reading a sequence in memory will pack the bytes into an EVM word (i.e. U256), while +the "unpacking" operation consists in breaking down an EVM word into its byte sequence and writing it to memory. + +This allows faster memory copies between two memory locations, as well as faster memory reset +(see \href{https://github.com/0xPolygonZero/plonky2/blob/main/evm/src/cpu/kernel/asm/memory/memcpy.asm}{memcpy.asm} and +\href{https://github.com/0xPolygonZero/plonky2/blob/main/evm/src/cpu/kernel/asm/memory/memset.asm}{memset.asm} modules). + +The `BytePackingStark' table has one row per packing/unpacking operation. + +Each row contains the following columns: +\begin{enumerate} + \item 5 columns containing information on the initial memory address from which the sequence starts + (namely a flag differentiating read and write operations, address context, segment and offset values, as well as timestamp), + \item 32 columns $b_i$ indicating the length of the byte sequence ($b_i = 1$ if the length is $i+1$, and $b_i = 0$ otherwise), + \item 32 columns $v_i$ indicating the values of the bytes that have been read or written during a sequence, + \item 2 columns $r_i$ needed for range-checking the byte values. +\end{enumerate} + +\paragraph{Notes on columns generation:} +Whenever a byte unpacking operation is called, the value $\texttt{val}$ is read from the stack, but because the EVM and the STARKs use different endianness, we need to convert $\texttt{val}$ to a little-endian byte sequence. 
Only then do we resize it to the appropriate length, and prune extra zeros and higher bytes in the process. Finally, we reverse the byte order and write this new sequence into the $v_i$ columns of the table. + +Whenever the operation is a byte packing, the bytes are read one by one from memory and stored in the $v_i$ columns of the BytePackingStark table. + +Note that because of the different endianness on the memory and EVM sides, we write bytes starting with the last one. + +The $b_i$ columns hold a boolean value. $b_i = 1$ whenever we are currently reading or writing the i-th element in the byte sequence. $b_i = 0$ otherwise. + +\paragraph{Cross-table lookups:} +The read or written bytes need to be checked against both the cpu and the memory tables. Whenever we call $\texttt{MSTORE\_32BYTES}$, $\texttt{MLOAD\_32BYTES}$ or $\texttt{PUSH}$ on the cpu side, we make use of `BytePackingStark' to make sure we are carrying out the correct operation on the correct values. For this, we check that the following values correspond: +\begin{enumerate} + \item the address (comprising the context, the segment, and the virtual address), + \item the length of the byte sequence, + \item the timestamp, + \item the value (either written to or read from the stack) +\end{enumerate} + +The address here corresponds to the address of the first byte. + +On the other hand, we need to make sure that the read and write operations correspond to the values read or stored on the memory side. 
We therefore need a CTL for each byte, checking that the following values are identical in `MemoryStark' and `BytePackingStark': +\begin{enumerate} + \item a flag indicating whether the operation is a read or a write, + \item the address (context, segment and virtual address), + \item the byte (followed by 0s to make sure the memory address contains a byte and not a U256 word), + \item the timestamp +\end{enumerate} + +Note that the virtual address has to be recomputed based on the length of the sequence of bytes. The virtual address for the $i$-th byte is written as: +$$ \texttt{virt} + \sum_{j=0}^{31} b_j * j - i$$ +where $\sum_{j=0}^{31} b_j * j$ is equal to $\texttt{sequence\_length} - 1$. + +\paragraph*{Note on range-check:} Range-checking is necessary whenever we do a memory unpacking operation that will +write values to memory. These values are constrained by the range-check to be 8-bit values, i.e. fitting between 0 and 255 included. +While range-checking values read from memory is not necessary, because we use the same $\texttt{byte\_values}$ columns for both read +and write operations, this extra condition is enforced throughout the whole trace regardless of the operation type. + diff --git a/evm/spec/tables/cpu.tex b/evm/spec/tables/cpu.tex index 76c8be07a8..7bca5a9f5e 100644 --- a/evm/spec/tables/cpu.tex +++ b/evm/spec/tables/cpu.tex @@ -1,4 +1,73 @@ \subsection{CPU} \label{cpu} -TODO +The CPU is the central component of the zkEVM. Like any CPU, it reads instructions, executes them and modifies the state (registers and the memory) +accordingly. The constraining of some complex instructions (e.g. Keccak hashing) is delegated to other tables. +This section will only briefly present the CPU and its columns. Details about the CPU logic will be provided later. + +\subsubsection{CPU flow} + +An execution run can be decomposed into two distinct parts: +\begin{itemize} + \item \textbf{CPU cycles:} The bulk of the execution. 
In each row, the CPU reads the current code at the program counter (PC) address, and executes it. The current code can be the kernel code, +or whichever code is being executed in the current context (transaction code or contract code). Executing an instruction consists in modifying the registers, possibly +performing some memory operations, and updating the PC. + \item \textbf{Padding:} At the end of the execution, we need to pad the length of the CPU trace to the next power of two. When the program counter reaches the special halting label +in the kernel, execution halts. Constraints ensure that every subsequent row is a padding row and that execution cannot resume. +\end{itemize} + +In the CPU cycles phase, the CPU can switch between different contexts, which correspond to the different environments of the possible calls. Context 0 is the kernel itself, which +handles initialization (input processing, transaction parsing, transaction trie updating...) and termination (receipt creation, final trie checks...) before and after executing the transaction. Subsequent contexts are created when +executing user code (transaction or contract code). In a non-zero user context, syscalls may be executed, which are specific instructions written in the kernel. They don't change the context +but change the code context, which is where the instructions are read from. + +\subsubsection{CPU columns} + +\paragraph*{Registers:} \begin{itemize} + \item \texttt{context}: Indicates which context we are in. 0 for the kernel, and a positive integer for every user context. Incremented by 1 at every call. + \item \texttt{code\_context}: Indicates in which context the code to execute resides. It's equal to \texttt{context} in user mode, but is always 0 in kernel mode. + \item \texttt{program\_counter}: The address of the instruction to be read and executed. + \item \texttt{stack\_len}: The current length of the stack. 
+ \item \texttt{is\_kernel\_mode}: Boolean indicating whether we are in kernel (i.e. privileged) mode. This means we are executing kernel code, and we have access to +privileged instructions. + \item \texttt{gas}: The current amount of gas used in the current context. It is eventually checked to be below the current gas limit. Must fit in 32 bits. + \item \texttt{clock}: Monotonic counter which starts at 0 and is incremented by 1 at each row. Used to enforce correct ordering of memory accesses. + \item \texttt{opcode\_bits}: 8 boolean columns, which are the bit decomposition of the opcode being read at the current PC. +\end{itemize} + +\paragraph*{Operation flags:} Boolean flags. During CPU cycles phase, each row executes a single instruction, which sets one and only one operation flag. No flag is set during +padding. The decoding constraints ensure that the flag set corresponds to the opcode being read. +There isn't a 1-to-1 correspondence between instructions and flags. For efficiency, the same flag can be set by different, unrelated instructions (e.g. \texttt{eq\_iszero}, which represents +the \texttt{EQ} and the \texttt{ISZERO} instructions). When there is a need to differentiate them in constraints, we filter them with their respective opcode: since the first bit of \texttt{EQ}'s opcode +(resp. \texttt{ISZERO}'s opcode) is 0 (resp. 1), we can filter a constraint for an EQ instruction with \texttt{eq\_iszero * (1 - opcode\_bits[0])} +(resp. \texttt{eq\_iszero * opcode\_bits[0]}). + +\paragraph*{Memory columns:} The CPU interacts with the EVM memory via its memory channels. At each row, a memory channel can execute a write, a read, or be disabled. A full memory channel is composed of: +\begin{itemize} + \item \texttt{used}: Boolean flag. If it's set to 1, a memory operation is executed in this channel at this row. If it's set to 0, no operation is done but its columns might be reused for other purposes.
+ \item \texttt{is\_read}: Boolean flag indicating if a memory operation is a read or a write. + \item 3 \texttt{address} columns. A memory address is made of three parts: \texttt{context}, \texttt{segment} and \texttt{virtual}. + \item 8 \texttt{value} columns. EVM words are 256 bits long, and they are broken down in 8 32-bit limbs. +\end{itemize} +The last memory channel is a partial channel: it doesn't have its own \texttt{value} columns and shares them with the first full memory channel. This allows us to save eight columns. + +\paragraph*{General columns:} There are 8 shared general columns. Depending on the instruction, they are used differently: +\begin{itemize} + \item \texttt{Exceptions}: When raising an exception, the first three general columns are the bit decomposition of the exception code. +They are used to jump to the correct exception handler. + \item \texttt{Logic}: For EQ, and ISZERO operations, it's easy to check that the result is 1 if \texttt{input0} and \texttt{input1} are equal. It's more difficult +to prove that, if the result is 0, the inputs are actually unequal. To prove it, each general column contains the modular inverse of $(\texttt{input0}_i - \texttt{input1}_i)$ +for each limb $i$ (or 0 if the limbs are equal). Then the quantity $\texttt{general}_i * (\texttt{input0}_i - \texttt{input1}_i)$ will be 1 if and only if $\texttt{general}_i$ is +indeed the modular inverse, which is only possible if the difference is non-zero. + \item \texttt{Jumps}: For jumps, we use the first two columns: \texttt{should\_jump} and \texttt{cond\_sum\_pinv}. \texttt{should\_jump} conditions whether the EVM should jump: it's +1 for a JUMP, and $\texttt{condition} \neq 0$ for a JUMPI. To check if the condition is actually non-zero for a JUMPI, \texttt{cond\_sum\_pinv} stores the modular inverse of +\texttt{condition} (or 0 if it's zero). + \item \texttt{Shift}: For shifts, the logic differs depending on whether the displacement is lower than $2^{32}$, i.e. 
if it fits in a single value limb. +To check if this is not the case, we must check that at least one of the seven high limbs is not zero. The general column \texttt{high\_limb\_sum\_inv} holds the modular inverse +of the sum of the seven high limbs, and is used to check it's non-zero as in the previous cases. +Contrary to the logic operations, we do not need to check limbs individually: each limb has been range-checked to 32 bits, meaning that it's not possible for the sum to +overflow and be zero if some of the limbs are non-zero. + \item \texttt{Stack}: \texttt{stack\_inv}, \texttt{stack\_inv\_aux} and \texttt{stack\_inv\_aux\_2} are used by popping-only (resp. pushing-only) instructions to check if the stack is empty after (resp. was empty +before) the instruction. \texttt{stack\_len\_bounds\_aux} is used to check that the stack doesn't overflow in user mode. We use the last four columns to prevent conflicts with the other general columns. +See \ref{stackhandling} for more details. +\end{itemize} diff --git a/evm/spec/tables/keccak-f.tex b/evm/spec/tables/keccak-f.tex index 76e9e9f457..7eee4b53fc 100644 --- a/evm/spec/tables/keccak-f.tex +++ b/evm/spec/tables/keccak-f.tex @@ -2,3 +2,64 @@ \subsection{Keccak-f} \label{keccak-f} This table computes the Keccak-f[1600] permutation. + +\subsubsection{Keccak-f Permutation} +To explain how this table is structured, we first need to detail how the permutation is computed. \href{https://keccak.team/keccak_specs_summary.html}{This page} gives a pseudo-code for the permutation. Our implementation differs slightly -- but remains equivalent -- for optimization and constraint degree reasons.
+ +Let: +\begin{itemize} + \item $S$ be the sponge width ($S=25$ in our case) + \item $\texttt{NUM\_ROUNDS}$ be the number of Keccak rounds ($\texttt{NUM\_ROUNDS} = 24$) + \item $RC$ a vector of round constants of size $\texttt{NUM\_ROUNDS}$ + \item $I$ be the input of the permutation, comprised of $S$ 64-bit elements +\end{itemize} + +The first step is to reshape $I$ into a $5 \times 5$ matrix. We initialize the state $A$ of the sponge with $I$: $$A[x, y] := I[x, y] \text{ } \forall x, y \in \{0..4\}$$ + +We store $A$ in the table, and subdivide each 64-bit element into two 32-bit limbs. +Then, for each round $i$, we proceed as follows: +\begin{enumerate} + \item First, we define $C[x] := \texttt{xor}_{i=0}^4 A[x, i]$. We store $C$ as bits in the table. This is because we need to apply a rotation on its elements' bits and carry out \texttt{ xor } operations in the next step. + \item Then, we store a second vector $C'$ in bits, such that: $$C'[x, z] = C[x, z] \texttt{ xor } C[x-1, z] \texttt{ xor } C[x+1, z-1]$$. + \item We then need to store the updated value of $A$: $$A'[x, y] = A[x, y] \texttt{ xor } C[x, y] \texttt{ xor } C'[x, y]$$ Note that this is equivalent to the equation in the official Keccak-f description: $$A'[x, y] = A[x, y] \texttt{ xor } C[x-1, z] \texttt{ xor } C[x+1, z-1]$$. + \item The previous three points correspond to the $\theta$ step in Keccak-f. We can now move on to the $\rho$ and $\pi$ steps. These steps are written as: $$B[y, 2\times x + 3 \times y] := \texttt{rot}(A'[x, y], r[x, y])$$ where $\texttt{rot(a, s)}$ is the bitwise cyclic shift operation, and $r$ is the matrix of rotation offsets. We do not need to store $B$: $B$'s bits are only a permutation of $A'$'s bits. 
+ \item The $\chi$ step updates the state once again, and we store the new values: $$A''[x, y] := B[x, y] \texttt{ xor } (\texttt{not }B[x+1, y] \texttt{ and } B[x+2, y])$$ Because of the way we carry out constraints (as explained below), we do not need to store the individual bits for $A''$: we only need the 32-bit limbs. + \item The final step, $\iota$, consists in updating the first element of the state as follows: $$A'''[0, 0] = A''[0, 0] \texttt{ xor } RC[i]$$ where $$A'''[x, y] = A''[x, y] \forall (x, y) \neq (0, 0)$$ Since only the first element is updated, we only need to store $A'''[0, 0]$ of this updated state. The remaining elements are fetched from $A''$. However, because of the bitwise $\texttt{xor}$ operation, we do need columns for the bits of $A''[0, 0]$. +\end{enumerate} + +Note that all permutation elements are 64-bit long. But they are stored as 32-bit limbs so that we do not overflow the field. + +It is also important to note that all bitwise logic operations ($\texttt{ xor }$, $\texttt{ not }$ and $\texttt{ and}$) are checked in this table. This is why we need to store the bits of most elements. The logic table can only carry out eight 32-bit logic operations per row. Thus, leveraging it here would drastically increase the number of logic rows, and incur too much overhead in proving time. + + + +\subsubsection{Columns} +Using the notations from the previous section, we can now list the columns in the table: +\begin{enumerate} + \item $\texttt{NUM\_ROUND}S = 24$ columns $c_i$ to determine which round is currently being computed. $c_i = 1$ when we are in the $i$-th round, and 0 otherwise. These columns' purpose is to ensure that the correct round constants are used at each round. + \item $1$ column $t$ which stores the timestamp at which the Keccak operation was called in the cpu. This column enables us to ensure that inputs and outputs are consistent between the cpu, keccak-sponge and keccak-f tables. 
+ \item $5 \times 5 \times 2 = 50$ columns to store the elements of $A$. As a reminder, each 64-bit element is divided into two 32-bit limbs, and $A$ comprises $S = 25$ elements. + \item $5 \times 64 = 320$ columns to store the bits of the vector $C$. + \item $5 \times 64 = 320$ columns to store the bits of the vector $C'$. + \item $5 \times 5 \times 64 = 1600$ columns to store the bits of $A'$. + \item $5 \times 5 \times 2 = 50$ columns to store the 32-bit limbs of $A''$. + \item $64$ columns to store the bits of $A''[0, 0]$. + \item $2$ columns to store the two limbs of $A'''[0, 0]$. +\end{enumerate} + +In total, this table comprises 2,431 columns. + +\subsubsection{Constraints} +Some constraints checking that the elements are computed correctly are not straightforward. Let us detail them here. + +First, it is important to highlight the fact that a $\texttt{xor}$ between two elements is of degree 2. Indeed, for $x \texttt{ xor } y$, the constraint is $x + y - 2 \times x \times y$, which is of degree 2. This implies that a $\texttt{xor}$ between 3 elements is of degree 3, which is the maximal constraint degree for our STARKs. + +We can check that $C'[x, z] = C[x, z] \texttt{ xor } C[x - 1, z] \texttt{ xor } C[x + 1, z - 1]$. However, we cannot directly check that $C[x] = \texttt{xor}_{i=0}^4 A[x, i]$, as it would be a degree 5 constraint. Instead, we use $C'$ for this constraint. We see that: +$$\texttt{xor}_{i=0}^4 A'[x, i, z] = C'[x, z]$$ +This implies that the difference $d = \sum_{i=0}^4 A'[x, i, z] - C'[x, z]$ is either 0, 2 or 4. We can therefore enforce the following degree 3 constraint instead: +$$d \times (d - 2) \times (d - 4) = 0$$ + +Additionally, we have to check that $A'$ is well constructed. We know that $A'$ should be such that $A'[x, y, z] = A[x, y, z] \texttt{ xor } C[x, z] \texttt{ xor } C'[x, z]$.
Since we do not have the bits of $A$ elements but the bits of $A'$ elements, we check the equivalent degree 3 constraint: +$$A[x, y, z] = A'[x, y, z] \texttt{ xor } C[x, z] \texttt{ xor } C'[x, z]$$ + +Finally, the constraints for the remaining elements, $A''$ and $A'''$, are straightforward: $A''$ is a three-element bitwise $\texttt{xor}$ where all bits involved are already stored, and $A'''[0, 0]$ is the output of a simple bitwise $\texttt{xor}$ with a round constant. \ No newline at end of file diff --git a/evm/spec/tables/keccak-sponge.tex b/evm/spec/tables/keccak-sponge.tex index 29f71ba1c4..a712335b8f 100644 --- a/evm/spec/tables/keccak-sponge.tex +++ b/evm/spec/tables/keccak-sponge.tex @@ -1,4 +1,66 @@ -\subsection{Keccak sponge} +\subsection{KeccakSponge} \label{keccak-sponge} -This table computes the Keccak256 hash, a sponge-based hash built on top of the Keccak-f[1600] permutation. +This table computes the Keccak256 hash, a sponge-based hash built on top of the Keccak-f[1600] permutation. An instance of KeccakSponge takes as input a Memory address $a$, +a length $l$, and computes the Keccak256 digest of the memory segment starting at $a$ and of size $l$. An instance can span many rows, each individual row being a single call to +the Keccak table. Note that all the read elements must be bytes; the proof will be unverifiable if this is not the case. Following the Keccak specifications, the input string is padded to the next multiple of 136 bytes. +Each row contains the following columns: +\begin{itemize} + \item Read bytes: + \begin{itemize} + \item 3 address columns: \texttt{context}, \texttt{segment} and the offset \texttt{virt} of $a$. + \item \texttt{timestamp}: the timestamp which will be used for all memory reads of this instance. + \item \texttt{already\_absorbed\_bytes}: keeps track of how many bytes have been hashed in the current instance. At the end of an instance, we should have absorbed $l$ bytes in total.
+ \item \texttt{KECCAK\_RATE\_BYTES} \texttt{block\_bytes} columns: the bytes being absorbed at this row. They are read from memory and will be XORed to the rate part of the current state. + \end{itemize} + \item Input columns: + \begin{itemize} + \item \texttt{KECCAK\_RATE\_U32S} \texttt{original\_rate\_u32s} columns: hold the rate part of the state before XORing it with \texttt{block\_bytes}. At the beginning of an instance, they are initialized with 0. + \item \texttt{KECCAK\_RATE\_U32S} \texttt{xored\_rate\_u32s} columns: hold the original rate XORed with \texttt{block\_bytes}. + \item \texttt{KECCAK\_CAPACITY\_U32S} \texttt{original\_capacity\_u32s} columns: hold the capacity part of the state before applying the Keccak permutation. + \end{itemize} + \item Output columns: + \begin{itemize} + \item \texttt{KECCAK\_DIGEST\_BYTES} \texttt{updated\_digest\_state\_bytes} columns: the beginning of the output state after applying the Keccak permutation. At the last row of an instance, they hold the computed hash. +They are decomposed in bytes for endianness reasons. + \item \texttt{KECCAK\_WIDTH\_MINUS\_DIGEST\_U32S} \texttt{partial\_updated\_state\_u32s} columns: the rest of the output state. They are discarded for the final digest, but are used between instance rows. + \end{itemize} + \item Helper columns: + \begin{itemize} + \item \texttt{is\_full\_input\_block}: indicates if the current row has a full input block, i.e. \texttt{block\_bytes} contains only bytes read from memory and no padding bytes. + \item \texttt{KECCAK\_RATE\_BYTES} \texttt{is\_final\_input\_len} columns: in the final row of an instance, indicate where the final read byte is. If the $i$-th column is set to 1, it means that +all bytes after the $i$-th are padding bytes. In a full input block, all columns are set to 0.
+ \end{itemize} +\end{itemize} + +For each instance, constraints ensure that: +\begin{itemize} + \item at each row: + \begin{itemize} + \item \texttt{is\_full\_input\_block} and \texttt{is\_final\_input\_len} columns are all binary. + \item Only one column in \texttt{is\_full\_input\_block} and \texttt{is\_final\_input\_len} is set to 1. + \item \texttt{xored\_rate\_u32s} is \texttt{original\_rate\_u32s} XOR \texttt{block\_bytes}. + \item The CTL with Keccak ensures that (\texttt{updated\_digest\_state\_bytes columns}, \texttt{partial\_updated\_state\_u32s}) is the Keccak permutation output of (\texttt{xored\_rate\_u32s}, \texttt{original\_capacity\_u32s}). + \end{itemize} + \item at the first row: + \begin{itemize} + \item \texttt{original\_rate\_u32s} is all 0. + \item \texttt{already\_absorbed\_bytes} is 0. + \end{itemize} + \item at each full input row (i.e. \texttt{is\_full\_input\_block} is 1, all \texttt{is\_final\_input\_len} columns are 0): + \begin{itemize} + \item \texttt{context}, \texttt{segment}, \texttt{virt} and \texttt{timestamp} are unchanged in the next row. + \item Next \texttt{already\_absorbed\_bytes} is current \texttt{already\_absorbed\_bytes} + \texttt{KECCAK\_RATE\_BYTES}. + \item Next (\texttt{original\_rate\_u32s}, \texttt{original\_capacity\_u32s}) is current (\texttt{updated\_digest\_state\_bytes columns}, \texttt{partial\_updated\_state\_u32s}). + \item The CTL with Memory ensures that \texttt{block\_bytes} is filled with contiguous memory elements [$a$ + \texttt{already\_absorbed\_bytes}, $a$ + \texttt{already\_absorbed\_bytes} + \texttt{KECCAK\_RATE\_BYTES} - 1] + \end{itemize} + \item at the final row (i.e. 
\texttt{is\_full\_input\_block} is 0, \texttt{is\_final\_input\_len}'s $i$-th column is 1 for a certain $i$, the rest are 0): + \begin{itemize} + \item The CTL with Memory ensures that \texttt{block\_bytes} is filled with contiguous memory elements [$a$ + \texttt{already\_absorbed\_bytes}, $a$ + \texttt{already\_absorbed\_bytes} + $i$ - 1]. The rest are padding bytes. + \item The CTL with CPU ensures that \texttt{context}, \texttt{segment}, \texttt{virt} and \texttt{timestamp} match the \texttt{KECCAK\_GENERAL} call. + \item The CTL with CPU ensures that $l$ = \texttt{already\_absorbed\_bytes} + $i$. + \item The CTL with CPU ensures that \texttt{updated\_digest\_state\_bytes} is the output of the \texttt{KECCAK\_GENERAL} call. + \end{itemize} +\end{itemize} + +The trace is padded to the next power of two with dummy rows, whose \texttt{is\_full\_input\_block} and \texttt{is\_final\_input\_len} columns are all 0. diff --git a/evm/spec/tables/logic.tex b/evm/spec/tables/logic.tex index b430c95dce..e2425fc4a8 100644 --- a/evm/spec/tables/logic.tex +++ b/evm/spec/tables/logic.tex @@ -1,4 +1,18 @@ \subsection{Logic} \label{logic} -TODO +Each row of the logic table corresponds to one bitwise logic operation: either AND, OR or XOR. Each input for these operations is represented as 256 bits, while the output is stored as eight 32-bit limbs. + +Each row therefore contains the following columns: +\begin{enumerate} + \item $f_{\texttt{and}}$, an ``is and'' flag, which should be 1 for an AND operation and 0 otherwise, + \item $f_{\texttt{or}}$, an ``is or'' flag, which should be 1 for an OR operation and 0 otherwise, + \item $f_{\texttt{xor}}$, an ``is xor'' flag, which should be 1 for a XOR operation and 0 otherwise, + \item 256 columns $x_{1, i}$ for the bits of the first input $x_1$, + \item 256 columns $x_{2, i}$ for the bits of the second input $x_2$, + \item 8 columns $r_i$ for the 32-bit limbs of the output $r$.
+\end{enumerate} + +Note that we need all three flags because we need to be able to distinguish between an operation row and a padding row -- where all flags are set to 0. + +The subdivision into bits is required for the two inputs as the table carries out bitwise operations. The result, on the other hand, is represented in 32-bit limbs since we do not need individual bits and can therefore save the remaining 248 columns. Moreover, the output is checked against the CPU, which stores values in the same way. diff --git a/evm/spec/tables/memory.tex b/evm/spec/tables/memory.tex index 9653f391b4..d39e99b23d 100644 --- a/evm/spec/tables/memory.tex +++ b/evm/spec/tables/memory.tex @@ -11,39 +11,40 @@ \subsection{Memory} \item $v$, the value being read or written \item $\tau$, the timestamp of the operation \end{enumerate} -The memory table should be ordered by $(a, \tau)$. Note that the correctness memory could be checked as follows: +The memory table should be ordered by $(a, \tau)$. Note that the correctness of the memory could be checked as follows: \begin{enumerate} - \item Verify the ordering by checking that $(a_i, \tau_i) < (a_{i+1}, \tau_{i+1})$ for each consecutive pair. - \item Enumerate the purportedly-ordered log while tracking a ``current'' value $c$, which is initially zero.\footnote{EVM memory is zero-initialized.} + \item Verify the ordering by checking that $(a_i, \tau_i) \leq (a_{i+1}, \tau_{i+1})$ for each consecutive pair. + \item Enumerate the purportedly-ordered log while tracking the ``current'' value of $v$. \begin{enumerate} - \item Upon observing an address which doesn't match that of the previous row, set $c \leftarrow 0$. - \item Upon observing a write, set $c \leftarrow v$. - \item Upon observing a read, check that $v = c$. + \item Upon observing an address which doesn't match that of the previous row, if the address is zero-initialized + and if the operation is a read, check that $v = 0$. + \item Upon observing a write, don't constrain $v$.
+ \item Upon observing a read at timestamp $\tau_i$ which isn't the first operation at this address, check that $v_i = v_{i-1}$. \end{enumerate} \end{enumerate} -The ordering check is slightly involved since we are comparing multiple columns. To facilitate this, we add an additional column $e$, where the prover can indicate whether two consecutive addresses are equal. An honest prover will set +The ordering check is slightly involved since we are comparing multiple columns. To facilitate this, we add an additional column $e$, where the prover can indicate whether two consecutive addresses changed. An honest prover will set $$ e_i \leftarrow \begin{cases} - 1 & \text{if } a_i = a_{i + 1}, \\ + 1 & \text{if } a_i \neq a_{i + 1}, \\ 0 & \text{otherwise}. \end{cases} $$ +We also introduce a range-check column $c$, which should hold: +$$ +c_i \leftarrow \begin{cases} + a_{i + 1} - a_i - 1 & \text{if } e_i = 1, \\ + \tau_{i+1} - \tau_i & \text{otherwise}. +\end{cases} +$$ +The extra $-1$ ensures that the address actually changed if $e_i = 1$. We then impose the following transition constraints: \begin{enumerate} \item $e_i (e_i - 1) = 0$, - \item $e_i (a_i - a_{i + 1}) = 0$, - \item $e_i (\tau_{i + 1} - \tau_i) + (1 - e_i) (a_{i + 1} - a_i - 1) < 2^{32}$. -\end{enumerate} -The last constraint emulates a comparison between two addresses or timestamps by bounding their difference; this assumes that all addresses and timestamps fit in 32 bits and that the field is larger than that. - -Finally, the iterative checks can be arithmetized by introducing a trace column for the current value $c$. We add a boundary constraint $c_0 = 0$, and the following transition constraints: -\todo{This is out of date, we don't actually need a $c$ column.} -\begin{enumerate} - \item $v_{\text{from},i} = c_i$, - \item $c_{i + 1} = e_i v_{\text{to},i}$. + \item $(1 - e_i) (a_{i + 1} - a_i) = 0$, + \item $c_i < 2^{32}$. 
\end{enumerate} - +The third constraint emulates a comparison between two addresses or timestamps by bounding their difference; this assumes that all addresses and timestamps fit in 32 bits and that the field is larger than that. \subsubsection{Virtual memory} @@ -55,7 +56,32 @@ \subsubsection{Virtual memory} \end{enumerate} The comparisons now involve several columns, which requires some minor adaptations to the technique described above; we will leave these as an exercise to the reader. +Note that an additional constraint check is required: whenever we change the context or the segment, the virtual address must be range-checked to $2^{32}$. +Without this check, addresses could start at -1 (i.e. $p - 2$) and then increase properly. \subsubsection{Timestamps} -TODO: Explain $\tau = \texttt{NUM\_CHANNELS} \times \texttt{cycle} + \texttt{channel}$. +Memory operations are sorted by address $a$ and timestamp $\tau$. For a memory operation in the CPU, we have: +$$\tau = \texttt{NUM\_CHANNELS} \times \texttt{cycle} + \texttt{channel}.$$ +Since a memory channel can only hold at most one memory operation, every CPU memory operation's timestamp is unique. + +Note that it doesn't mean that all memory operations have unique timestamps. There are two exceptions: + +\begin{itemize} + \item Before the CPU cycles, we write some global metadata in memory. These extra operations are done at timestamp $\tau = 0$. + \item Some tables other than CPU can generate memory operations, like KeccakSponge. When this happens, these operations all have the timestamp of the CPU row of the instruction which invoked the table (for KeccakSponge, KECCAK\_GENERAL). +\end{itemize} + +\subsubsection{Memory initialization} + +By default, all memory is zero-initialized. However, to save numerous writes, we allow some specific segments to be initialized with arbitrary values. + +\begin{itemize} + \item The read-only kernel code (in segment 0, context 0) is initialized with its correct values. 
It's checked by hashing the segment and verifying +that the hash value matches a verifier-provided one. + \item The code segment (segment 0) in other contexts is initialized with externally-provided account code, then checked against the account code hash. +If the code is meant to be executed, there is a soundness concern: if the code is malformed and ends with an incomplete PUSH, then the missing bytes must +be 0 according to the Ethereum specs. To prevent the issue, we manually write 33 zeros (at most 32 bytes for the PUSH argument, and an extra one for +the post-PUSH PC value). + \item The ``TrieData'' segment is initialized with the input tries. The stored tries are hashed and checked against the provided initial hash. Note that the length of the segment and the pointers -- within the ``TrieData'' segment -- for the three tries are provided as prover inputs. The length is then checked against a value computed when hashing the tries. +\end{itemize} diff --git a/evm/spec/zkevm.pdf b/evm/spec/zkevm.pdf index f181eba624..3b10fba30b 100644 Binary files a/evm/spec/zkevm.pdf and b/evm/spec/zkevm.pdf differ diff --git a/evm/spec/zkevm.tex b/evm/spec/zkevm.tex index 2927e7a543..ee2c38a54e 100644 --- a/evm/spec/zkevm.tex +++ b/evm/spec/zkevm.tex @@ -8,6 +8,7 @@ \usepackage{makecell} \usepackage{mathtools} \usepackage{tabularx} +\usepackage{enumitem} \usepackage[textwidth=1.25in]{todonotes} % Scale for DRAFT watermark.
@@ -52,7 +53,7 @@ \input{framework} \input{tables} \input{mpts} -\input{instructions} +\input{cpulogic} \bibliography{bibliography}{} \bibliographystyle{ieeetr} diff --git a/evm/src/all_stark.rs b/evm/src/all_stark.rs index 079ff114c4..cd7a2d3c38 100644 --- a/evm/src/all_stark.rs +++ b/evm/src/all_stark.rs @@ -1,4 +1,4 @@ -use std::iter; +use core::ops::Deref; use plonky2::field::extension::Extendable; use plonky2::field::types::Field; @@ -11,7 +11,7 @@ use crate::config::StarkConfig; use crate::cpu::cpu_stark; use crate::cpu::cpu_stark::CpuStark; use crate::cpu::membus::NUM_GP_CHANNELS; -use crate::cross_table_lookup::{CrossTableLookup, TableWithColumns}; +use crate::cross_table_lookup::{CrossTableLookup, TableIdx, TableWithColumns}; use crate::keccak::keccak_stark; use crate::keccak::keccak_stark::KeccakStark; use crate::keccak_sponge::columns::KECCAK_RATE_BYTES; @@ -23,19 +23,21 @@ use crate::memory::memory_stark; use crate::memory::memory_stark::MemoryStark; use crate::stark::Stark; +/// Structure containing all STARKs and the cross-table lookups. #[derive(Clone)] pub struct AllStark, const D: usize> { - pub arithmetic_stark: ArithmeticStark, - pub byte_packing_stark: BytePackingStark, - pub cpu_stark: CpuStark, - pub keccak_stark: KeccakStark, - pub keccak_sponge_stark: KeccakSpongeStark, - pub logic_stark: LogicStark, - pub memory_stark: MemoryStark, - pub cross_table_lookups: Vec>, + pub(crate) arithmetic_stark: ArithmeticStark, + pub(crate) byte_packing_stark: BytePackingStark, + pub(crate) cpu_stark: CpuStark, + pub(crate) keccak_stark: KeccakStark, + pub(crate) keccak_sponge_stark: KeccakSpongeStark, + pub(crate) logic_stark: LogicStark, + pub(crate) memory_stark: MemoryStark, + pub(crate) cross_table_lookups: Vec>, } impl, const D: usize> Default for AllStark { + /// Returns an `AllStark` containing all the STARKs initialized with default values. 
fn default() -> Self { Self { arithmetic_stark: ArithmeticStark::default(), @@ -64,6 +66,7 @@ impl, const D: usize> AllStark { } } +/// Associates STARK tables with a unique index. #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum Table { Arithmetic = 0, @@ -75,10 +78,22 @@ pub enum Table { Memory = 6, } +impl Deref for Table { + type Target = TableIdx; + + fn deref(&self) -> &Self::Target { + // Hacky way to implement `Deref` for `Table` so that we don't have to + // call `Table::Foo as usize`, but perhaps too ugly to be worth it. + [&0, &1, &2, &3, &4, &5, &6][*self as TableIdx] + } +} + +/// Number of STARK tables. pub(crate) const NUM_TABLES: usize = Table::Memory as usize + 1; impl Table { - pub(crate) fn all() -> [Self; NUM_TABLES] { + /// Returns all STARK table indices. + pub(crate) const fn all() -> [Self; NUM_TABLES] { [ Self::Arithmetic, Self::BytePacking, @@ -91,6 +106,7 @@ impl Table { } } +/// Returns all the `CrossTableLookups` used for proving the EVM. pub(crate) fn all_cross_table_lookups() -> Vec> { vec![ ctl_arithmetic(), @@ -103,6 +119,7 @@ pub(crate) fn all_cross_table_lookups() -> Vec> { ] } +/// `CrossTableLookup` for `ArithmeticStark`, to connect it with the `Cpu` module. fn ctl_arithmetic() -> CrossTableLookup { CrossTableLookup::new( vec![cpu_stark::ctl_arithmetic_base_rows()], @@ -110,127 +127,169 @@ fn ctl_arithmetic() -> CrossTableLookup { ) } +/// `CrossTableLookup` for `BytePackingStark`, to connect it with the `Cpu` module. 
fn ctl_byte_packing() -> CrossTableLookup { let cpu_packing_looking = TableWithColumns::new( - Table::Cpu, + *Table::Cpu, cpu_stark::ctl_data_byte_packing(), Some(cpu_stark::ctl_filter_byte_packing()), ); let cpu_unpacking_looking = TableWithColumns::new( - Table::Cpu, + *Table::Cpu, cpu_stark::ctl_data_byte_unpacking(), Some(cpu_stark::ctl_filter_byte_unpacking()), ); + let cpu_push_packing_looking = TableWithColumns::new( + *Table::Cpu, + cpu_stark::ctl_data_byte_packing_push(), + Some(cpu_stark::ctl_filter_byte_packing_push()), + ); + let cpu_jumptable_read_looking = TableWithColumns::new( + *Table::Cpu, + cpu_stark::ctl_data_jumptable_read(), + Some(cpu_stark::ctl_filter_syscall_exceptions()), + ); let byte_packing_looked = TableWithColumns::new( - Table::BytePacking, + *Table::BytePacking, byte_packing_stark::ctl_looked_data(), Some(byte_packing_stark::ctl_looked_filter()), ); CrossTableLookup::new( - vec![cpu_packing_looking, cpu_unpacking_looking], + vec![ + cpu_packing_looking, + cpu_unpacking_looking, + cpu_push_packing_looking, + cpu_jumptable_read_looking, + ], byte_packing_looked, ) } -// We now need two different looked tables for `KeccakStark`: -// one for the inputs and one for the outputs. -// They are linked with the timestamp. +/// `CrossTableLookup` for `KeccakStark` inputs, to connect it with the `KeccakSponge` module. +/// `KeccakStarkSponge` looks into `KeccakStark` to give the inputs of the sponge. +/// Its consistency with the 'output' CTL is ensured through a timestamp column on the `KeccakStark` side. 
fn ctl_keccak_inputs() -> CrossTableLookup { let keccak_sponge_looking = TableWithColumns::new( - Table::KeccakSponge, + *Table::KeccakSponge, keccak_sponge_stark::ctl_looking_keccak_inputs(), Some(keccak_sponge_stark::ctl_looking_keccak_filter()), ); let keccak_looked = TableWithColumns::new( - Table::Keccak, + *Table::Keccak, keccak_stark::ctl_data_inputs(), Some(keccak_stark::ctl_filter_inputs()), ); CrossTableLookup::new(vec![keccak_sponge_looking], keccak_looked) } +/// `CrossTableLookup` for `KeccakStark` outputs, to connect it with the `KeccakSponge` module. +/// `KeccakStarkSponge` looks into `KeccakStark` to give the outputs of the sponge. fn ctl_keccak_outputs() -> CrossTableLookup { let keccak_sponge_looking = TableWithColumns::new( - Table::KeccakSponge, + *Table::KeccakSponge, keccak_sponge_stark::ctl_looking_keccak_outputs(), Some(keccak_sponge_stark::ctl_looking_keccak_filter()), ); let keccak_looked = TableWithColumns::new( - Table::Keccak, + *Table::Keccak, keccak_stark::ctl_data_outputs(), Some(keccak_stark::ctl_filter_outputs()), ); CrossTableLookup::new(vec![keccak_sponge_looking], keccak_looked) } +/// `CrossTableLookup` for `KeccakSpongeStark` to connect it with the `Cpu` module. fn ctl_keccak_sponge() -> CrossTableLookup { let cpu_looking = TableWithColumns::new( - Table::Cpu, + *Table::Cpu, cpu_stark::ctl_data_keccak_sponge(), Some(cpu_stark::ctl_filter_keccak_sponge()), ); let keccak_sponge_looked = TableWithColumns::new( - Table::KeccakSponge, + *Table::KeccakSponge, keccak_sponge_stark::ctl_looked_data(), Some(keccak_sponge_stark::ctl_looked_filter()), ); CrossTableLookup::new(vec![cpu_looking], keccak_sponge_looked) } +/// `CrossTableLookup` for `LogicStark` to connect it with the `Cpu` and `KeccakSponge` modules. 
fn ctl_logic() -> CrossTableLookup { let cpu_looking = TableWithColumns::new( - Table::Cpu, + *Table::Cpu, cpu_stark::ctl_data_logic(), Some(cpu_stark::ctl_filter_logic()), ); let mut all_lookers = vec![cpu_looking]; for i in 0..keccak_sponge_stark::num_logic_ctls() { let keccak_sponge_looking = TableWithColumns::new( - Table::KeccakSponge, + *Table::KeccakSponge, keccak_sponge_stark::ctl_looking_logic(i), Some(keccak_sponge_stark::ctl_looking_logic_filter()), ); all_lookers.push(keccak_sponge_looking); } let logic_looked = - TableWithColumns::new(Table::Logic, logic::ctl_data(), Some(logic::ctl_filter())); + TableWithColumns::new(*Table::Logic, logic::ctl_data(), Some(logic::ctl_filter())); CrossTableLookup::new(all_lookers, logic_looked) } +/// `CrossTableLookup` for `MemoryStark` to connect it with all the modules which need memory accesses. fn ctl_memory() -> CrossTableLookup { let cpu_memory_code_read = TableWithColumns::new( - Table::Cpu, + *Table::Cpu, cpu_stark::ctl_data_code_memory(), Some(cpu_stark::ctl_filter_code_memory()), ); let cpu_memory_gp_ops = (0..NUM_GP_CHANNELS).map(|channel| { TableWithColumns::new( - Table::Cpu, + *Table::Cpu, cpu_stark::ctl_data_gp_memory(channel), Some(cpu_stark::ctl_filter_gp_memory(channel)), ) }); + let cpu_push_write_ops = TableWithColumns::new( + *Table::Cpu, + cpu_stark::ctl_data_partial_memory::(), + Some(cpu_stark::ctl_filter_partial_memory()), + ); + let cpu_set_context_write = TableWithColumns::new( + *Table::Cpu, + cpu_stark::ctl_data_memory_old_sp_write_set_context::(), + Some(cpu_stark::ctl_filter_set_context()), + ); + let cpu_set_context_read = TableWithColumns::new( + *Table::Cpu, + cpu_stark::ctl_data_memory_new_sp_read_set_context::(), + Some(cpu_stark::ctl_filter_set_context()), + ); let keccak_sponge_reads = (0..KECCAK_RATE_BYTES).map(|i| { TableWithColumns::new( - Table::KeccakSponge, + *Table::KeccakSponge, keccak_sponge_stark::ctl_looking_memory(i), 
Some(keccak_sponge_stark::ctl_looking_memory_filter(i)), ) }); let byte_packing_ops = (0..32).map(|i| { TableWithColumns::new( - Table::BytePacking, + *Table::BytePacking, byte_packing_stark::ctl_looking_memory(i), Some(byte_packing_stark::ctl_looking_memory_filter(i)), ) }); - let all_lookers = iter::once(cpu_memory_code_read) - .chain(cpu_memory_gp_ops) - .chain(keccak_sponge_reads) - .chain(byte_packing_ops) - .collect(); + let all_lookers = vec![ + cpu_memory_code_read, + cpu_push_write_ops, + cpu_set_context_write, + cpu_set_context_read, + ] + .into_iter() + .chain(cpu_memory_gp_ops) + .chain(keccak_sponge_reads) + .chain(byte_packing_ops) + .collect(); let memory_looked = TableWithColumns::new( - Table::Memory, + *Table::Memory, memory_stark::ctl_data(), Some(memory_stark::ctl_filter()), ); diff --git a/evm/src/arithmetic/addcy.rs b/evm/src/arithmetic/addcy.rs index 3366e432ae..4f343b45d5 100644 --- a/evm/src/arithmetic/addcy.rs +++ b/evm/src/arithmetic/addcy.rs @@ -149,7 +149,7 @@ pub(crate) fn eval_packed_generic_addcy( } } -pub fn eval_packed_generic( +pub(crate) fn eval_packed_generic( lv: &[P; NUM_ARITH_COLUMNS], yield_constr: &mut ConstraintConsumer

, ) { @@ -236,7 +236,7 @@ pub(crate) fn eval_ext_circuit_addcy, const D: usiz } } -pub fn eval_ext_circuit, const D: usize>( +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut CircuitBuilder, lv: &[ExtensionTarget; NUM_ARITH_COLUMNS], yield_constr: &mut RecursiveConstraintConsumer, diff --git a/evm/src/arithmetic/arithmetic_stark.rs b/evm/src/arithmetic/arithmetic_stark.rs index 3d281c868c..5e3f039cdf 100644 --- a/evm/src/arithmetic/arithmetic_stark.rs +++ b/evm/src/arithmetic/arithmetic_stark.rs @@ -1,5 +1,5 @@ -use std::marker::PhantomData; -use std::ops::Range; +use core::marker::PhantomData; +use core::ops::Range; use plonky2::field::extension::{Extendable, FieldExtension}; use plonky2::field::packed::PackedField; @@ -11,19 +11,19 @@ use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::util::transpose; use static_assertions::const_assert; -use super::columns::NUM_ARITH_COLUMNS; +use super::columns::{op_flags, NUM_ARITH_COLUMNS}; use super::shift; use crate::all_stark::Table; -use crate::arithmetic::columns::{RANGE_COUNTER, RC_FREQUENCIES, SHARED_COLS}; +use crate::arithmetic::columns::{NUM_SHARED_COLS, RANGE_COUNTER, RC_FREQUENCIES, SHARED_COLS}; use crate::arithmetic::{addcy, byte, columns, divmod, modular, mul, Operation}; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::cross_table_lookup::{Column, TableWithColumns}; +use crate::cross_table_lookup::TableWithColumns; use crate::evaluation_frame::{StarkEvaluationFrame, StarkFrame}; -use crate::lookup::Lookup; +use crate::lookup::{Column, Filter, Lookup}; use crate::stark::Stark; -/// Link the 16-bit columns of the arithmetic table, split into groups -/// of N_LIMBS at a time in `regs`, with the corresponding 32-bit +/// Creates a vector of `Columns` to link the 16-bit columns of the arithmetic table, +/// split into groups of N_LIMBS at a time in `regs`, with the corresponding 32-bit /// columns of the CPU table. 
Does this for all ops in `ops`. /// /// This is done by taking pairs of columns (x, y) of the arithmetic @@ -57,11 +57,18 @@ fn cpu_arith_data_link( res } -pub fn ctl_arithmetic_rows() -> TableWithColumns { +/// Returns the `TableWithColumns` for `ArithmeticStark` rows where one of the arithmetic operations has been called. +pub(crate) fn ctl_arithmetic_rows() -> TableWithColumns { // We scale each filter flag with the associated opcode value. // If an arithmetic operation is happening on the CPU side, // the CTL will enforce that the reconstructed opcode value // from the opcode bits matches. + // These opcodes are missing the syscall and prover_input opcodes, + // since `IS_RANGE_CHECK` can be associated to multiple opcodes. + // For `IS_RANGE_CHECK`, the opcodes are written in OPCODE_COL, + // and we use that column for scaling and the CTL checks. + // Note that we ensure in the STARK's constraints that the + // value in `OPCODE_COL` is 0 if `IS_RANGE_CHECK` = 0. const COMBINED_OPS: [(usize, u8); 16] = [ (columns::IS_ADD, 0x01), (columns::IS_MUL, 0x02), @@ -88,30 +95,38 @@ pub fn ctl_arithmetic_rows() -> TableWithColumns { columns::OUTPUT_REGISTER, ]; - let filter_column = Some(Column::sum(COMBINED_OPS.iter().map(|(c, _v)| *c))); + let mut filter_cols = COMBINED_OPS.to_vec(); + filter_cols.push((columns::IS_RANGE_CHECK, 0x01)); + let filter = Some(Filter::new_simple(Column::sum( + filter_cols.iter().map(|(c, _v)| *c), + ))); + + let mut all_combined_cols = COMBINED_OPS.to_vec(); + all_combined_cols.push((columns::OPCODE_COL, 0x01)); // Create the Arithmetic Table whose columns are those of the // operations listed in `ops` whose inputs and outputs are given // by `regs`, where each element of `regs` is a range of columns // corresponding to a 256-bit input or output register (also `ops` // is used as the operation filter). 
TableWithColumns::new( - Table::Arithmetic, - cpu_arith_data_link(&COMBINED_OPS, ®ISTER_MAP), - filter_column, + *Table::Arithmetic, + cpu_arith_data_link(&all_combined_cols, ®ISTER_MAP), + filter, ) } +/// Structure representing the `Arithmetic` STARK, which carries out all the arithmetic operations. #[derive(Copy, Clone, Default)] -pub struct ArithmeticStark { +pub(crate) struct ArithmeticStark { pub f: PhantomData, } -const RANGE_MAX: usize = 1usize << 16; // Range check strict upper bound +pub(crate) const RANGE_MAX: usize = 1usize << 16; // Range check strict upper bound impl ArithmeticStark { /// Expects input in *column*-major layout - fn generate_range_checks(&self, cols: &mut Vec>) { + fn generate_range_checks(&self, cols: &mut [Vec]) { debug_assert!(cols.len() == columns::NUM_ARITH_COLUMNS); let n_rows = cols[0].len(); @@ -193,6 +208,20 @@ impl, const D: usize> Stark for ArithmeticSta let lv: &[P; NUM_ARITH_COLUMNS] = vars.get_local_values().try_into().unwrap(); let nv: &[P; NUM_ARITH_COLUMNS] = vars.get_next_values().try_into().unwrap(); + // Flags must be boolean. + for flag_idx in op_flags() { + let flag = lv[flag_idx]; + yield_constr.constraint(flag * (flag - P::ONES)); + } + + // Only a single flag must be activated at once. + let all_flags = op_flags().map(|i| lv[i]).sum::

(); + yield_constr.constraint(all_flags * (all_flags - P::ONES)); + + // Check that `OPCODE_COL` holds 0 if the operation is not a range_check. + let opcode_constraint = (P::ONES - lv[columns::IS_RANGE_CHECK]) * lv[columns::OPCODE_COL]; + yield_constr.constraint(opcode_constraint); + // Check the range column: First value must be 0, last row // must be 2^16-1, and intermediate rows must increment by 0 // or 1. @@ -204,11 +233,17 @@ impl, const D: usize> Stark for ArithmeticSta let range_max = P::Scalar::from_canonical_u64((RANGE_MAX - 1) as u64); yield_constr.constraint_last_row(rc1 - range_max); + // Evaluate constraints for the MUL operation. mul::eval_packed_generic(lv, yield_constr); + // Evaluate constraints for ADD, SUB, LT and GT operations. addcy::eval_packed_generic(lv, yield_constr); + // Evaluate constraints for DIV and MOD operations. divmod::eval_packed(lv, nv, yield_constr); + // Evaluate constraints for ADDMOD, SUBMOD, MULMOD and for FP254 modular operations. modular::eval_packed(lv, nv, yield_constr); + // Evaluate constraints for the BYTE operation. byte::eval_packed(lv, yield_constr); + // Evaluate constraints for SHL and SHR operations. shift::eval_packed_generic(lv, nv, yield_constr); } @@ -223,6 +258,31 @@ impl, const D: usize> Stark for ArithmeticSta let nv: &[ExtensionTarget; NUM_ARITH_COLUMNS] = vars.get_next_values().try_into().unwrap(); + // Flags must be boolean. + for flag_idx in op_flags() { + let flag = lv[flag_idx]; + let constraint = builder.mul_sub_extension(flag, flag, flag); + yield_constr.constraint(builder, constraint); + } + + // Only a single flag must be activated at once. + let all_flags = builder.add_many_extension(op_flags().map(|i| lv[i])); + let constraint = builder.mul_sub_extension(all_flags, all_flags, all_flags); + yield_constr.constraint(builder, constraint); + + // Check that `OPCODE_COL` holds 0 if the operation is not a range_check. 
+ let opcode_constraint = builder.arithmetic_extension( + F::NEG_ONE, + F::ONE, + lv[columns::IS_RANGE_CHECK], + lv[columns::OPCODE_COL], + lv[columns::OPCODE_COL], + ); + yield_constr.constraint(builder, opcode_constraint); + + // Check the range column: First value must be 0, last row + // must be 2^16-1, and intermediate rows must increment by 0 + // or 1. let rc1 = lv[columns::RANGE_COUNTER]; let rc2 = nv[columns::RANGE_COUNTER]; yield_constr.constraint_first_row(builder, rc1); @@ -234,11 +294,17 @@ impl, const D: usize> Stark for ArithmeticSta let t = builder.sub_extension(rc1, range_max); yield_constr.constraint_last_row(builder, t); + // Evaluate constraints for the MUL operation. mul::eval_ext_circuit(builder, lv, yield_constr); + // Evaluate constraints for ADD, SUB, LT and GT operations. addcy::eval_ext_circuit(builder, lv, yield_constr); + // Evaluate constraints for DIV and MOD operations. divmod::eval_ext_circuit(builder, lv, nv, yield_constr); + // Evaluate constraints for ADDMOD, SUBMOD, MULMOD and for FP254 modular operations. modular::eval_ext_circuit(builder, lv, nv, yield_constr); + // Evaluate constraints for the BYTE operation. byte::eval_ext_circuit(builder, lv, yield_constr); + // Evaluate constraints for SHL and SHR operations. shift::eval_ext_circuit(builder, lv, nv, yield_constr); } @@ -246,11 +312,12 @@ impl, const D: usize> Stark for ArithmeticSta 3 } - fn lookups(&self) -> Vec { + fn lookups(&self) -> Vec> { vec![Lookup { - columns: SHARED_COLS.collect(), - table_column: RANGE_COUNTER, - frequencies_column: RC_FREQUENCIES, + columns: Column::singles(SHARED_COLS).collect(), + table_column: Column::single(RANGE_COUNTER), + frequencies_column: Column::single(RC_FREQUENCIES), + filter_columns: vec![None; NUM_SHARED_COLS], }] } } diff --git a/evm/src/arithmetic/byte.rs b/evm/src/arithmetic/byte.rs index bb8cd12122..f7581efa77 100644 --- a/evm/src/arithmetic/byte.rs +++ b/evm/src/arithmetic/byte.rs @@ -60,7 +60,7 @@ //! 
y * 256 ∈ {0, 256, 512, ..., 2^16 - 256} //! 8. Hence y ∈ {0, 1, ..., 255} -use std::ops::Range; +use core::ops::Range; use ethereum_types::U256; use plonky2::field::extension::Extendable; @@ -197,7 +197,7 @@ pub(crate) fn generate(lv: &mut [F], idx: U256, val: U256) { ); } -pub fn eval_packed( +pub(crate) fn eval_packed( lv: &[P; NUM_ARITH_COLUMNS], yield_constr: &mut ConstraintConsumer

, ) { @@ -293,7 +293,7 @@ pub fn eval_packed( } } -pub fn eval_ext_circuit, const D: usize>( +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut CircuitBuilder, lv: &[ExtensionTarget; NUM_ARITH_COLUMNS], yield_constr: &mut RecursiveConstraintConsumer, @@ -306,6 +306,7 @@ pub fn eval_ext_circuit, const D: usize>( let idx_decomp = &lv[AUX_INPUT_REGISTER_0]; let tree = &lv[AUX_INPUT_REGISTER_1]; + // low 5 bits of the first limb of idx: let mut idx0_lo5 = builder.zero_extension(); for i in 0..5 { let bit = idx_decomp[i]; @@ -316,6 +317,9 @@ pub fn eval_ext_circuit, const D: usize>( let scale = builder.constant_extension(scale); idx0_lo5 = builder.mul_add_extension(bit, scale, idx0_lo5); } + // Verify that idx0_hi is the high (11) bits of the first limb of + // idx (in particular idx0_hi is at most 11 bits, since idx[0] is + // at most 16 bits). let t = F::Extension::from(F::from_canonical_u64(32)); let t = builder.constant_extension(t); let t = builder.mul_add_extension(idx_decomp[5], t, idx0_lo5); @@ -323,6 +327,9 @@ pub fn eval_ext_circuit, const D: usize>( let t = builder.mul_extension(is_byte, t); yield_constr.constraint(builder, t); + // Verify the layers of the tree + // NB: Each of the bit values is negated in place to account for + // the reversed indexing. let one = builder.one_extension(); let bit = idx_decomp[4]; for i in 0..8 { @@ -362,6 +369,8 @@ pub fn eval_ext_circuit, const D: usize>( let t = builder.mul_extension(is_byte, t); yield_constr.constraint(builder, t); + // Check byte decomposition of last limb: + let base8 = F::Extension::from(F::from_canonical_u64(1 << 8)); let base8 = builder.constant_extension(base8); let lo_byte = lv[BYTE_LAST_LIMB_LO]; @@ -380,19 +389,29 @@ pub fn eval_ext_circuit, const D: usize>( yield_constr.constraint(builder, t); let expected_out_byte = tree[15]; + // Sum all higher limbs; sum will be non-zero iff idx >= 32. 
let mut hi_limb_sum = lv[BYTE_IDX_DECOMP_HI]; for i in 1..N_LIMBS { hi_limb_sum = builder.add_extension(hi_limb_sum, idx[i]); } + // idx_is_large is 0 or 1 let idx_is_large = lv[BYTE_IDX_IS_LARGE]; let t = builder.mul_sub_extension(idx_is_large, idx_is_large, idx_is_large); let t = builder.mul_extension(is_byte, t); yield_constr.constraint(builder, t); + // If hi_limb_sum is nonzero, then idx_is_large must be one. let t = builder.sub_extension(idx_is_large, one); let t = builder.mul_many_extension([is_byte, hi_limb_sum, t]); yield_constr.constraint(builder, t); + // If idx_is_large is 1, then hi_limb_sum_inv must be the inverse + // of hi_limb_sum, hence hi_limb_sum is non-zero, hence idx is + // indeed "large". + // + // Otherwise, if idx_is_large is 0, then hi_limb_sum * hi_limb_sum_inv + // is zero, which is only possible if hi_limb_sum is zero, since + // hi_limb_sum_inv is non-zero. let base16 = F::from_canonical_u64(1 << 16); let hi_limb_sum_inv = builder.mul_const_add_extension( base16, @@ -414,6 +433,7 @@ pub fn eval_ext_circuit, const D: usize>( let t = builder.mul_extension(is_byte, check); yield_constr.constraint(builder, t); + // Check that the rest of the output limbs are zero for i in 1..N_LIMBS { let t = builder.mul_extension(is_byte, out[i]); yield_constr.constraint(builder, t); diff --git a/evm/src/arithmetic/columns.rs b/evm/src/arithmetic/columns.rs index df2d12476b..e4172bc073 100644 --- a/evm/src/arithmetic/columns.rs +++ b/evm/src/arithmetic/columns.rs @@ -1,8 +1,8 @@ //! Arithmetic unit -use std::ops::Range; +use core::ops::Range; -pub const LIMB_BITS: usize = 16; +pub(crate) const LIMB_BITS: usize = 16; const EVM_REGISTER_BITS: usize = 256; /// Return the number of LIMB_BITS limbs that are in an EVM @@ -20,7 +20,7 @@ const fn n_limbs() -> usize { } /// Number of LIMB_BITS limbs that are in on EVM register-sized number. 
-pub const N_LIMBS: usize = n_limbs(); +pub(crate) const N_LIMBS: usize = n_limbs(); pub(crate) const IS_ADD: usize = 0; pub(crate) const IS_MUL: usize = IS_ADD + 1; @@ -38,8 +38,14 @@ pub(crate) const IS_GT: usize = IS_LT + 1; pub(crate) const IS_BYTE: usize = IS_GT + 1; pub(crate) const IS_SHL: usize = IS_BYTE + 1; pub(crate) const IS_SHR: usize = IS_SHL + 1; +pub(crate) const IS_RANGE_CHECK: usize = IS_SHR + 1; +/// Column that stores the opcode if the operation is a range check. +pub(crate) const OPCODE_COL: usize = IS_RANGE_CHECK + 1; +pub(crate) const START_SHARED_COLS: usize = OPCODE_COL + 1; -pub(crate) const START_SHARED_COLS: usize = IS_SHR + 1; +pub(crate) const fn op_flags() -> Range { + IS_ADD..IS_RANGE_CHECK + 1 +} /// Within the Arithmetic Unit, there are shared columns which can be /// used by any arithmetic circuit, depending on which one is active @@ -109,4 +115,5 @@ pub(crate) const RANGE_COUNTER: usize = START_SHARED_COLS + NUM_SHARED_COLS; /// The frequencies column used in logUp. pub(crate) const RC_FREQUENCIES: usize = RANGE_COUNTER + 1; -pub const NUM_ARITH_COLUMNS: usize = START_SHARED_COLS + NUM_SHARED_COLS + 2; +/// Number of columns in `ArithmeticStark`. +pub(crate) const NUM_ARITH_COLUMNS: usize = START_SHARED_COLS + NUM_SHARED_COLS + 2; diff --git a/evm/src/arithmetic/divmod.rs b/evm/src/arithmetic/divmod.rs index e143ded6dd..a4599dc721 100644 --- a/evm/src/arithmetic/divmod.rs +++ b/evm/src/arithmetic/divmod.rs @@ -1,4 +1,8 @@ -use std::ops::Range; +//! Support for EVM instructions DIV and MOD. +//! +//! The logic for verifying them is detailed in the `modular` submodule. 
+ +use core::ops::Range; use ethereum_types::U256; use plonky2::field::extension::Extendable; diff --git a/evm/src/arithmetic/mod.rs b/evm/src/arithmetic/mod.rs index 7763e98a06..f9a816c1f8 100644 --- a/evm/src/arithmetic/mod.rs +++ b/evm/src/arithmetic/mod.rs @@ -1,6 +1,11 @@ use ethereum_types::U256; use plonky2::field::types::PrimeField64; +use self::columns::{ + INPUT_REGISTER_0, INPUT_REGISTER_1, INPUT_REGISTER_2, OPCODE_COL, OUTPUT_REGISTER, +}; +use self::utils::u256_to_array; +use crate::arithmetic::columns::IS_RANGE_CHECK; use crate::extension_tower::BN_BASE; use crate::util::{addmod, mulmod, submod}; @@ -15,6 +20,9 @@ mod utils; pub mod arithmetic_stark; pub(crate) mod columns; +/// An enum representing different binary operations. +/// +/// `Shl` and `Shr` are handled differently, by leveraging `Mul` and `Div` respectively. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum BinaryOperator { Add, @@ -33,6 +41,7 @@ pub(crate) enum BinaryOperator { } impl BinaryOperator { + /// Computes the result of a binary arithmetic operation given two inputs. pub(crate) fn result(&self, input0: U256, input1: U256) -> U256 { match self { BinaryOperator::Add => input0.overflowing_add(input1).0, @@ -81,7 +90,8 @@ impl BinaryOperator { } } - pub(crate) fn row_filter(&self) -> usize { + /// Maps a binary arithmetic operation to its associated flag column in the trace. + pub(crate) const fn row_filter(&self) -> usize { match self { BinaryOperator::Add => columns::IS_ADD, BinaryOperator::Mul => columns::IS_MUL, @@ -100,6 +110,7 @@ impl BinaryOperator { } } +/// An enum representing different ternary operations. #[allow(clippy::enum_variant_names)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum TernaryOperator { @@ -109,6 +120,7 @@ pub(crate) enum TernaryOperator { } impl TernaryOperator { + /// Computes the result of a ternary arithmetic operation given three inputs. 
pub(crate) fn result(&self, input0: U256, input1: U256, input2: U256) -> U256 { match self { TernaryOperator::AddMod => addmod(input0, input1, input2), @@ -117,7 +129,8 @@ impl TernaryOperator { } } - pub(crate) fn row_filter(&self) -> usize { + /// Maps a ternary arithmetic operation to its associated flag column in the trace. + pub(crate) const fn row_filter(&self) -> usize { match self { TernaryOperator::AddMod => columns::IS_ADDMOD, TernaryOperator::MulMod => columns::IS_MULMOD, @@ -127,6 +140,7 @@ impl TernaryOperator { } /// An enum representing arithmetic operations that can be either binary or ternary. +#[allow(clippy::enum_variant_names)] #[derive(Debug)] pub(crate) enum Operation { BinaryOperation { @@ -142,10 +156,17 @@ pub(crate) enum Operation { input2: U256, result: U256, }, + RangeCheckOperation { + input0: U256, + input1: U256, + input2: U256, + opcode: U256, + result: U256, + }, } impl Operation { - /// Create a binary operator with given inputs. + /// Creates a binary operator with given inputs. /// /// NB: This works as you would expect, EXCEPT for SHL and SHR, /// whose inputs need a small amount of preprocessing. Specifically, @@ -170,6 +191,7 @@ impl Operation { } } + /// Creates a ternary operator with given inputs. pub(crate) fn ternary( operator: TernaryOperator, input0: U256, @@ -186,10 +208,28 @@ impl Operation { } } + pub(crate) const fn range_check( + input0: U256, + input1: U256, + input2: U256, + opcode: U256, + result: U256, + ) -> Self { + Self::RangeCheckOperation { + input0, + input1, + input2, + opcode, + result, + } + } + + /// Gets the result of an arithmetic operation. pub(crate) fn result(&self) -> U256 { match self { Operation::BinaryOperation { result, .. } => *result, Operation::TernaryOperation { result, .. 
} => *result, + _ => panic!("This function should not be called for range checks."), } } @@ -218,10 +258,18 @@ impl Operation { input2, result, } => ternary_op_to_rows(operator.row_filter(), input0, input1, input2, result), + Operation::RangeCheckOperation { + input0, + input1, + input2, + opcode, + result, + } => range_check_to_rows(input0, input1, input2, opcode, result), } } } +/// Converts a ternary arithmetic operation to one or two rows of the `ArithmeticStark` table. fn ternary_op_to_rows( row_filter: usize, input0: U256, @@ -239,6 +287,7 @@ fn ternary_op_to_rows( (row1, Some(row2)) } +/// Converts a binary arithmetic operation to one or two rows of the `ArithmeticStark` table. fn binary_op_to_rows( op: BinaryOperator, input0: U256, @@ -281,3 +330,21 @@ fn binary_op_to_rows( } } } + +fn range_check_to_rows( + input0: U256, + input1: U256, + input2: U256, + opcode: U256, + result: U256, +) -> (Vec, Option>) { + let mut row = vec![F::ZERO; columns::NUM_ARITH_COLUMNS]; + row[IS_RANGE_CHECK] = F::ONE; + row[OPCODE_COL] = F::from_canonical_u64(opcode.as_u64()); + u256_to_array(&mut row[INPUT_REGISTER_0], input0); + u256_to_array(&mut row[INPUT_REGISTER_1], input1); + u256_to_array(&mut row[INPUT_REGISTER_2], input2); + u256_to_array(&mut row[OUTPUT_REGISTER], result); + + (row, None) +} diff --git a/evm/src/arithmetic/modular.rs b/evm/src/arithmetic/modular.rs index 4e6e21a632..5a1df5c733 100644 --- a/evm/src/arithmetic/modular.rs +++ b/evm/src/arithmetic/modular.rs @@ -1,5 +1,5 @@ -//! Support for the EVM modular instructions ADDMOD, MULMOD and MOD, -//! as well as DIV. +//! Support for the EVM modular instructions ADDMOD, SUBMOD, MULMOD and MOD, +//! as well as DIV and FP254 related modular instructions. //! //! This crate verifies an EVM modular instruction, which takes three //! 256-bit inputs A, B and M, and produces a 256-bit output C satisfying @@ -108,7 +108,7 @@ //! only require 96 columns, or 80 if the output doesn't need to be //! reduced. 
-use std::ops::Range; +use core::ops::Range; use ethereum_types::U256; use num::bigint::Sign; @@ -478,7 +478,7 @@ pub(crate) fn modular_constr_poly( let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS); let offset = P::Scalar::from_canonical_u64(AUX_COEFF_ABS_MAX as u64); - // constr_poly = c(x) + q(x) * m(x) + (x - β) * s(x) + // constr_poly = c(x) + q(x) * m(x) + (x - β) * s(x)c let mut aux = [P::ZEROS; 2 * N_LIMBS]; for (c, i) in aux.iter_mut().zip(MODULAR_AUX_INPUT_LO) { // MODULAR_AUX_INPUT elements were offset by 2^20 in @@ -625,10 +625,13 @@ pub(crate) fn modular_constr_poly_ext_circuit, cons ) -> [ExtensionTarget; 2 * N_LIMBS] { let mod_is_zero = nv[MODULAR_MOD_IS_ZERO]; + // Check that mod_is_zero is zero or one let t = builder.mul_sub_extension(mod_is_zero, mod_is_zero, mod_is_zero); let t = builder.mul_extension(filter, t); yield_constr.constraint_transition(builder, t); + // Check that mod_is_zero is zero if modulus is not zero (they + // could both be zero) let limb_sum = builder.add_many_extension(modulus); let t = builder.mul_extension(limb_sum, mod_is_zero); let t = builder.mul_extension(filter, t); @@ -636,13 +639,19 @@ pub(crate) fn modular_constr_poly_ext_circuit, cons modulus[0] = builder.add_extension(modulus[0], mod_is_zero); + // Is 1 iff the operation is DIV or SHR and the denominator is zero. let div_denom_is_zero = nv[MODULAR_DIV_DENOM_IS_ZERO]; let div_shr_filter = builder.add_extension(lv[IS_DIV], lv[IS_SHR]); let t = builder.mul_sub_extension(mod_is_zero, div_shr_filter, div_denom_is_zero); let t = builder.mul_extension(filter, t); yield_constr.constraint_transition(builder, t); + + // Needed to compensate for adding mod_is_zero to modulus above, + // since the call eval_packed_generic_addcy() below subtracts modulus + // to verify in the case of a DIV or SHR. output[0] = builder.add_extension(output[0], div_denom_is_zero); + // Verify that the output is reduced, i.e. output < modulus. 
let out_aux_red = &nv[MODULAR_OUT_AUX_RED]; let one = builder.one_extension(); let zero = builder.zero_extension(); @@ -660,24 +669,31 @@ pub(crate) fn modular_constr_poly_ext_circuit, cons &is_less_than, true, ); + // restore output[0] output[0] = builder.sub_extension(output[0], div_denom_is_zero); + // prod = q(x) * m(x) let prod = pol_mul_wide2_ext_circuit(builder, quot, modulus); + // higher order terms must be zero for &x in prod[2 * N_LIMBS..].iter() { let t = builder.mul_extension(filter, x); yield_constr.constraint_transition(builder, t); } + // constr_poly = c(x) + q(x) * m(x) let mut constr_poly: [_; 2 * N_LIMBS] = prod[0..2 * N_LIMBS].try_into().unwrap(); pol_add_assign_ext_circuit(builder, &mut constr_poly, &output); let offset = builder.constant_extension(F::Extension::from_canonical_u64(AUX_COEFF_ABS_MAX as u64)); let zero = builder.zero_extension(); + + // constr_poly = c(x) + q(x) * m(x) let mut aux = [zero; 2 * N_LIMBS]; for (c, i) in aux.iter_mut().zip(MODULAR_AUX_INPUT_LO) { *c = builder.sub_extension(nv[i], offset); } + // add high 16-bits of aux input let base = F::from_canonical_u64(1u64 << LIMB_BITS); for (c, j) in aux.iter_mut().zip(MODULAR_AUX_INPUT_HI) { *c = builder.mul_const_add_extension(base, nv[j], *c); @@ -700,10 +716,13 @@ pub(crate) fn submod_constr_poly_ext_circuit, const modulus: [ExtensionTarget; N_LIMBS], mut quot: [ExtensionTarget; 2 * N_LIMBS], ) -> [ExtensionTarget; 2 * N_LIMBS] { + // quot was offset by 2^16 - 1 if it was negative; we undo that + // offset here: let (lo, hi) = quot.split_at_mut(N_LIMBS); let sign = hi[0]; let t = builder.mul_sub_extension(sign, sign, sign); let t = builder.mul_extension(filter, t); + // sign must be 1 (negative) or 0 (positive) yield_constr.constraint(builder, t); let offset = F::from_canonical_u16(u16::max_value()); for c in lo { @@ -712,6 +731,7 @@ pub(crate) fn submod_constr_poly_ext_circuit, const } hi[0] = builder.zero_extension(); for d in hi { + // All higher limbs must be zero let 
t = builder.mul_extension(filter, *d); yield_constr.constraint(builder, t); } @@ -737,8 +757,12 @@ pub(crate) fn eval_ext_circuit, const D: usize>( bn254_filter, ]); + // Ensure that this operation is not the last row of the table; + // needed because we access the next row of the table in nv. yield_constr.constraint_last_row(builder, filter); + // Verify that the modulus is the BN254 modulus for the + // {ADD,MUL,SUB}FP254 operations. let modulus = read_value::(lv, MODULAR_MODULUS); for (&mi, bi) in modulus.iter().zip(bn254_modulus_limbs()) { // bn254_filter * (mi - bi) @@ -760,6 +784,7 @@ pub(crate) fn eval_ext_circuit, const D: usize>( let mul_filter = builder.add_extension(lv[columns::IS_MULMOD], lv[columns::IS_MULFP254]); let addmul_filter = builder.add_extension(add_filter, mul_filter); + // constr_poly has 2*N_LIMBS limbs let submod_constr_poly = submod_constr_poly_ext_circuit( lv, nv, diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs index c09c39d8dc..01c9d5c1c0 100644 --- a/evm/src/arithmetic/mul.rs +++ b/evm/src/arithmetic/mul.rs @@ -107,7 +107,7 @@ pub(crate) fn generate_mul(lv: &mut [F], left_in: [i64; 16], ri .copy_from_slice(&aux_limbs.map(|c| F::from_canonical_u16((c >> 16) as u16))); } -pub fn generate(lv: &mut [F], left_in: U256, right_in: U256) { +pub(crate) fn generate(lv: &mut [F], left_in: U256, right_in: U256) { // TODO: It would probably be clearer/cleaner to read the U256 // into an [i64;N] and then copy that to the lv table. u256_to_array(&mut lv[INPUT_REGISTER_0], left_in); @@ -173,7 +173,7 @@ pub(crate) fn eval_packed_generic_mul( } } -pub fn eval_packed_generic( +pub(crate) fn eval_packed_generic( lv: &[P; NUM_ARITH_COLUMNS], yield_constr: &mut ConstraintConsumer

, ) { @@ -195,6 +195,8 @@ pub(crate) fn eval_ext_mul_circuit, const D: usize> let output_limbs = read_value::(lv, OUTPUT_REGISTER); let aux_limbs = { + // MUL_AUX_INPUT was offset by 2^20 in generation, so we undo + // that here let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << LIMB_BITS)); let offset = builder.constant_extension(F::Extension::from_canonical_u64(AUX_COEFF_ABS_MAX as u64)); @@ -211,17 +213,22 @@ pub(crate) fn eval_ext_mul_circuit, const D: usize> let mut constr_poly = pol_mul_lo_ext_circuit(builder, left_in_limbs, right_in_limbs); pol_sub_assign_ext_circuit(builder, &mut constr_poly, &output_limbs); + // This subtracts (x - β) * s(x) from constr_poly. let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << LIMB_BITS)); let rhs = pol_adjoin_root_ext_circuit(builder, aux_limbs, base); pol_sub_assign_ext_circuit(builder, &mut constr_poly, &rhs); + // At this point constr_poly holds the coefficients of the + // polynomial a(x)b(x) - c(x) - (x - β)*s(x). The + // multiplication is valid if and only if all of those + // coefficients are zero. for &c in &constr_poly { let filter = builder.mul_extension(filter, c); yield_constr.constraint(builder, filter); } } -pub fn eval_ext_circuit, const D: usize>( +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut CircuitBuilder, lv: &[ExtensionTarget; NUM_ARITH_COLUMNS], yield_constr: &mut RecursiveConstraintConsumer, diff --git a/evm/src/arithmetic/shift.rs b/evm/src/arithmetic/shift.rs index 6600c01e54..bb83798495 100644 --- a/evm/src/arithmetic/shift.rs +++ b/evm/src/arithmetic/shift.rs @@ -38,7 +38,7 @@ use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer /// NB: if `shift >= 256`, then the third register holds 0. /// We leverage the functions in mul.rs and divmod.rs to carry out /// the computation. 
-pub fn generate( +pub(crate) fn generate( lv: &mut [F], nv: &mut [F], is_shl: bool, @@ -117,7 +117,7 @@ fn eval_packed_shr( ); } -pub fn eval_packed_generic( +pub(crate) fn eval_packed_generic( lv: &[P; NUM_ARITH_COLUMNS], nv: &[P; NUM_ARITH_COLUMNS], yield_constr: &mut ConstraintConsumer

, @@ -168,7 +168,7 @@ fn eval_ext_circuit_shr, const D: usize>( ); } -pub fn eval_ext_circuit, const D: usize>( +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut CircuitBuilder, lv: &[ExtensionTarget; NUM_ARITH_COLUMNS], nv: &[ExtensionTarget; NUM_ARITH_COLUMNS], diff --git a/evm/src/arithmetic/utils.rs b/evm/src/arithmetic/utils.rs index 6ea375fef3..7350dd3263 100644 --- a/evm/src/arithmetic/utils.rs +++ b/evm/src/arithmetic/utils.rs @@ -1,4 +1,4 @@ -use std::ops::{Add, AddAssign, Mul, Neg, Range, Shr, Sub, SubAssign}; +use core::ops::{Add, AddAssign, Mul, Neg, Range, Shr, Sub, SubAssign}; use ethereum_types::U256; use plonky2::field::extension::Extendable; @@ -319,6 +319,7 @@ pub(crate) fn read_value_i64_limbs( } #[inline] +/// Turn a 64-bit integer into 4 16-bit limbs and convert them to field elements. fn u64_to_array(out: &mut [F], x: u64) { const_assert!(LIMB_BITS == 16); debug_assert!(out.len() == 4); @@ -329,6 +330,7 @@ fn u64_to_array(out: &mut [F], x: u64) { out[3] = F::from_canonical_u16((x >> 48) as u16); } +/// Turn a 256-bit integer into 16 16-bit limbs and convert them to field elements. // TODO: Refactor/replace u256_limbs in evm/src/util.rs pub(crate) fn u256_to_array(out: &mut [F], x: U256) { const_assert!(N_LIMBS == 16); diff --git a/evm/src/byte_packing/byte_packing_stark.rs b/evm/src/byte_packing/byte_packing_stark.rs index c28b055a81..ff7a18c06d 100644 --- a/evm/src/byte_packing/byte_packing_stark.rs +++ b/evm/src/byte_packing/byte_packing_stark.rs @@ -1,28 +1,22 @@ //! This crate enforces the correctness of reading and writing sequences //! of bytes in Big-Endian ordering from and to the memory. //! -//! The trace layout consists in N consecutive rows for an `N` byte sequence, -//! with the byte values being cumulatively written to the trace as they are -//! being processed. +//! The trace layout consists in one row for an `N` byte sequence (where 32 ≥ `N` > 0). //! -//! 
At row `i` of such a group (starting from 0), the `i`-th byte flag will be activated -//! (to indicate which byte we are going to be processing), but all bytes with index -//! 0 to `i` may have non-zero values, as they have already been processed. +//! At each row the `i`-th byte flag will be activated to indicate a sequence of +//! length i+1. //! -//! The length of a sequence is stored within each group of rows corresponding to that -//! sequence in a dedicated `SEQUENCE_LEN` column. At any row `i`, the remaining length -//! of the sequence being processed is retrieved from that column and the active byte flag -//! as: +//! The length of a sequence can be retrieved for CTLs as: //! -//! remaining_length = sequence_length - \sum_{i=0}^31 b[i] * i +//! sequence_length = \sum_{i=0}^31 b[i] * (i + 1) //! //! where b[i] is the `i`-th byte flag. //! //! Because of the discrepancy in endianness between the different tables, the byte sequences //! are actually written in the trace in reverse order from the order they are provided. -//! As such, the memory virtual address for a group of rows corresponding to a sequence starts -//! with the final virtual address, corresponding to the final byte being read/written, and -//! is being decremented at each step. +//! We only store the virtual address `virt` of the first byte, and the virtual address for byte `i` +//! can be recovered as: +//! virt_i = virt + sequence_length - 1 - i //! //! Note that, when writing a sequence of bytes to memory, both the `U256` value and the //! corresponding sequence length are being read from the stack. Because of the endianness @@ -31,7 +25,7 @@ //! This means that the higher-order bytes will be thrown away during the process, if the value //! is greater than 256^length, and as a result a different value will be stored in memory. 
-use std::marker::PhantomData; +use core::marker::PhantomData; use itertools::Itertools; use plonky2::field::extension::{Extendable, FieldExtension}; @@ -46,19 +40,20 @@ use plonky2::util::transpose; use super::NUM_BYTES; use crate::byte_packing::columns::{ - index_bytes, value_bytes, ADDR_CONTEXT, ADDR_SEGMENT, ADDR_VIRTUAL, BYTE_INDICES_COLS, IS_READ, - NUM_COLUMNS, RANGE_COUNTER, RC_FREQUENCIES, SEQUENCE_END, TIMESTAMP, + index_len, value_bytes, ADDR_CONTEXT, ADDR_SEGMENT, ADDR_VIRTUAL, IS_READ, LEN_INDICES_COLS, + NUM_COLUMNS, RANGE_COUNTER, RC_FREQUENCIES, TIMESTAMP, }; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::cross_table_lookup::Column; use crate::evaluation_frame::{StarkEvaluationFrame, StarkFrame}; -use crate::lookup::Lookup; +use crate::lookup::{Column, Filter, Lookup}; use crate::stark::Stark; use crate::witness::memory::MemoryAddress; /// Strict upper bound for the individual bytes range-check. const BYTE_RANGE_MAX: usize = 1usize << 8; +/// Creates the vector of `Columns` for `BytePackingStark` corresponding to the final packed limbs being read/written. +/// `CpuStark` will look into these columns, as the CPU needs the output of byte packing. pub(crate) fn ctl_looked_data() -> Vec> { // Reconstruct the u32 limbs composing the final `U256` word // being read/written from the underlying byte values. For each, @@ -66,37 +61,49 @@ pub(crate) fn ctl_looked_data() -> Vec> { // obtain the corresponding limb. let outputs: Vec> = (0..8) .map(|i| { - let range = (value_bytes(i * 4)..value_bytes(i * 4) + 4).collect_vec(); + let range = value_bytes(i * 4)..value_bytes(i * 4) + 4; Column::linear_combination( range - .iter() .enumerate() - .map(|(j, &c)| (c, F::from_canonical_u64(1 << (8 * j)))), + .map(|(j, c)| (c, F::from_canonical_u64(1 << (8 * j)))), ) }) .collect(); - // This will correspond to the actual sequence length when the `SEQUENCE_END` flag is on. 
let sequence_len: Column = Column::linear_combination( - (0..NUM_BYTES).map(|i| (index_bytes(i), F::from_canonical_usize(i + 1))), + (0..NUM_BYTES).map(|i| (index_len(i), F::from_canonical_usize(i + 1))), ); - Column::singles([ADDR_CONTEXT, ADDR_SEGMENT, ADDR_VIRTUAL]) + Column::singles([IS_READ, ADDR_CONTEXT, ADDR_SEGMENT, ADDR_VIRTUAL]) .chain([sequence_len]) .chain(Column::singles(&[TIMESTAMP])) .chain(outputs) .collect() } -pub fn ctl_looked_filter() -> Column { +/// CTL filter for the `BytePackingStark` looked table. +pub(crate) fn ctl_looked_filter() -> Filter { // The CPU table is only interested in our sequence end rows, // since those contain the final limbs of our packed int. - Column::single(SEQUENCE_END) + Filter::new_simple(Column::sum((0..NUM_BYTES).map(index_len))) } +/// Column linear combination for the `BytePackingStark` table reading/writing the `i`th byte sequence from `MemoryStark`. pub(crate) fn ctl_looking_memory(i: usize) -> Vec> { - let mut res = - Column::singles([IS_READ, ADDR_CONTEXT, ADDR_SEGMENT, ADDR_VIRTUAL]).collect_vec(); + let mut res = Column::singles([IS_READ, ADDR_CONTEXT, ADDR_SEGMENT]).collect_vec(); + + // Compute the virtual address: `ADDR_VIRTUAL` + `sequence_len` - 1 - i. + let sequence_len_minus_one = (0..NUM_BYTES) + .map(|j| (index_len(j), F::from_canonical_usize(j))) + .collect::>(); + let mut addr_virt_cols = vec![(ADDR_VIRTUAL, F::ONE)]; + addr_virt_cols.extend(sequence_len_minus_one); + let addr_virt = Column::linear_combination_with_constant( + addr_virt_cols, + F::NEG_ONE * F::from_canonical_usize(i), + ); + + res.push(addr_virt); // The i'th input byte being read/written. res.push(Column::single(value_bytes(i))); @@ -110,8 +117,8 @@ pub(crate) fn ctl_looking_memory(i: usize) -> Vec> { } /// CTL filter for reading/writing the `i`th byte of the byte sequence from/to memory. 
-pub(crate) fn ctl_looking_memory_filter(i: usize) -> Column { - Column::single(index_bytes(i)) +pub(crate) fn ctl_looking_memory_filter(i: usize) -> Filter { + Filter::new_simple(Column::sum((i..NUM_BYTES).map(index_len))) } /// Information about a byte packing operation needed for witness generation. @@ -132,7 +139,7 @@ pub(crate) struct BytePackingOp { } #[derive(Copy, Clone, Default)] -pub struct BytePackingStark { +pub(crate) struct BytePackingStark { pub(crate) f: PhantomData, } @@ -162,12 +169,14 @@ impl, const D: usize> BytePackingStark { ops: Vec, min_rows: usize, ) -> Vec<[F; NUM_COLUMNS]> { - let base_len: usize = ops.iter().map(|op| op.bytes.len()).sum(); + let base_len: usize = ops.iter().map(|op| usize::from(!op.bytes.is_empty())).sum(); let num_rows = core::cmp::max(base_len.max(BYTE_RANGE_MAX), min_rows).next_power_of_two(); let mut rows = Vec::with_capacity(num_rows); for op in ops { - rows.extend(self.generate_rows_for_op(op)); + if !op.bytes.is_empty() { + rows.push(self.generate_row_for_op(op)); + } } for _ in rows.len()..num_rows { @@ -177,7 +186,7 @@ impl, const D: usize> BytePackingStark { rows } - fn generate_rows_for_op(&self, op: BytePackingOp) -> Vec<[F; NUM_COLUMNS]> { + fn generate_row_for_op(&self, op: BytePackingOp) -> [F; NUM_COLUMNS] { let BytePackingOp { is_read, base_address, @@ -191,40 +200,32 @@ impl, const D: usize> BytePackingStark { virt, } = base_address; - let mut rows = Vec::with_capacity(bytes.len()); let mut row = [F::ZERO; NUM_COLUMNS]; row[IS_READ] = F::from_bool(is_read); row[ADDR_CONTEXT] = F::from_canonical_usize(context); row[ADDR_SEGMENT] = F::from_canonical_usize(segment); - // Because of the endianness, we start by the final virtual address value - // and decrement it at each step. Similarly, we process the byte sequence - // in reverse order. - row[ADDR_VIRTUAL] = F::from_canonical_usize(virt + bytes.len() - 1); + // We store the initial virtual segment. But the CTLs, + // we start with virt + sequence_len - 1. 
+ row[ADDR_VIRTUAL] = F::from_canonical_usize(virt); row[TIMESTAMP] = F::from_canonical_usize(timestamp); + row[index_len(bytes.len() - 1)] = F::ONE; + for (i, &byte) in bytes.iter().rev().enumerate() { - if i == bytes.len() - 1 { - row[SEQUENCE_END] = F::ONE; - } row[value_bytes(i)] = F::from_canonical_u8(byte); - row[index_bytes(i)] = F::ONE; - - rows.push(row); - row[index_bytes(i)] = F::ZERO; - row[ADDR_VIRTUAL] -= F::ONE; } - rows + row } - fn generate_padding_row(&self) -> [F; NUM_COLUMNS] { + const fn generate_padding_row(&self) -> [F; NUM_COLUMNS] { [F::ZERO; NUM_COLUMNS] } /// Expects input in *column*-major layout - fn generate_range_checks(&self, cols: &mut Vec>) { + fn generate_range_checks(&self, cols: &mut [Vec]) { debug_assert!(cols.len() == NUM_COLUMNS); let n_rows = cols[0].len(); @@ -254,37 +255,6 @@ impl, const D: usize> BytePackingStark { } } } - - /// There is only one `i` for which `local_values[index_bytes(i)]` is non-zero, - /// and `i+1` is the current position: - fn get_active_position(&self, row: &[P; NUM_COLUMNS]) -> P - where - FE: FieldExtension, - P: PackedField, - { - (0..NUM_BYTES) - .map(|i| row[index_bytes(i)] * P::Scalar::from_canonical_usize(i + 1)) - .sum() - } - - /// Recursive version of `get_active_position`. 
- fn get_active_position_circuit( - &self, - builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, - row: &[ExtensionTarget; NUM_COLUMNS], - ) -> ExtensionTarget { - let mut current_position = row[index_bytes(0)]; - - for i in 1..NUM_BYTES { - current_position = builder.mul_const_add_extension( - F::from_canonical_usize(i + 1), - row[index_bytes(i)], - current_position, - ); - } - - current_position - } } impl, const D: usize> Stark for BytePackingStark { @@ -306,11 +276,22 @@ impl, const D: usize> Stark for BytePackingSt let local_values: &[P; NUM_COLUMNS] = vars.get_local_values().try_into().unwrap(); let next_values: &[P; NUM_COLUMNS] = vars.get_next_values().try_into().unwrap(); + // Check the range column: First value must be 0, last row + // must be 255, and intermediate rows must increment by 0 + // or 1. + let rc1 = local_values[RANGE_COUNTER]; + let rc2 = next_values[RANGE_COUNTER]; + yield_constr.constraint_first_row(rc1); + let incr = rc2 - rc1; + yield_constr.constraint_transition(incr * incr - incr); + let range_max = P::Scalar::from_canonical_u64((BYTE_RANGE_MAX - 1) as u64); + yield_constr.constraint_last_row(rc1 - range_max); + let one = P::ONES; // We filter active columns by summing all the byte indices. // Constraining each of them to be boolean is done later on below. - let current_filter = local_values[BYTE_INDICES_COLS].iter().copied().sum::

(); + let current_filter = local_values[LEN_INDICES_COLS].iter().copied().sum::

(); yield_constr.constraint(current_filter * (current_filter - one)); // The filter column must start by one. @@ -322,86 +303,20 @@ impl, const D: usize> Stark for BytePackingSt // Each byte index must be boolean. for i in 0..NUM_BYTES { - let idx_i = local_values[index_bytes(i)]; + let idx_i = local_values[index_len(i)]; yield_constr.constraint(idx_i * (idx_i - one)); } - // The sequence start flag column must start by one. - let current_sequence_start = local_values[index_bytes(0)]; - yield_constr.constraint_first_row(current_sequence_start - one); - - // The sequence end flag must be boolean - let current_sequence_end = local_values[SEQUENCE_END]; - yield_constr.constraint(current_sequence_end * (current_sequence_end - one)); - - // If filter is off, all flags and byte indices must be off. - let byte_indices = local_values[BYTE_INDICES_COLS].iter().copied().sum::

(); - yield_constr.constraint( - (current_filter - one) * (current_is_read + current_sequence_end + byte_indices), - ); - // Only padding rows have their filter turned off. - let next_filter = next_values[BYTE_INDICES_COLS].iter().copied().sum::

(); + let next_filter = next_values[LEN_INDICES_COLS].iter().copied().sum::

(); yield_constr.constraint_transition(next_filter * (next_filter - current_filter)); - // Unless the current sequence end flag is activated, the is_read filter must remain unchanged. - let next_is_read = next_values[IS_READ]; - yield_constr - .constraint_transition((current_sequence_end - one) * (next_is_read - current_is_read)); - - // If the sequence end flag is activated, the next row must be a new sequence or filter must be off. - let next_sequence_start = next_values[index_bytes(0)]; - yield_constr.constraint_transition( - current_sequence_end * next_filter * (next_sequence_start - one), - ); - - // The active position in a byte sequence must increase by one on every row - // or be one on the next row (i.e. at the start of a new sequence). - let current_position = self.get_active_position(local_values); - let next_position = self.get_active_position(next_values); - yield_constr.constraint_transition( - next_filter * (next_position - one) * (next_position - current_position - one), - ); - - // The last row must be the end of a sequence or a padding row. - yield_constr.constraint_last_row(current_filter * (current_sequence_end - one)); - - // If the next position is one in an active row, the current end flag must be one. - yield_constr - .constraint_transition(next_filter * current_sequence_end * (next_position - one)); - - // The context, segment and timestamp fields must remain unchanged throughout a byte sequence. - // The virtual address must decrement by one at each step of a sequence. 
- let current_context = local_values[ADDR_CONTEXT]; - let next_context = next_values[ADDR_CONTEXT]; - let current_segment = local_values[ADDR_SEGMENT]; - let next_segment = next_values[ADDR_SEGMENT]; - let current_virtual = local_values[ADDR_VIRTUAL]; - let next_virtual = next_values[ADDR_VIRTUAL]; - let current_timestamp = local_values[TIMESTAMP]; - let next_timestamp = next_values[TIMESTAMP]; - yield_constr.constraint_transition( - next_filter * (next_sequence_start - one) * (next_context - current_context), - ); - yield_constr.constraint_transition( - next_filter * (next_sequence_start - one) * (next_segment - current_segment), - ); - yield_constr.constraint_transition( - next_filter * (next_sequence_start - one) * (next_timestamp - current_timestamp), - ); - yield_constr.constraint_transition( - next_filter * (next_sequence_start - one) * (current_virtual - next_virtual - one), - ); - - // If not at the end of a sequence, each next byte must equal the current one - // when reading through the sequence, or the next byte index must be one. - for i in 0..NUM_BYTES { - let current_byte = local_values[value_bytes(i)]; - let next_byte = next_values[value_bytes(i)]; - let next_byte_index = next_values[index_bytes(i)]; - yield_constr.constraint_transition( - (current_sequence_end - one) * (next_byte_index - one) * (next_byte - current_byte), - ); + // Check that all limbs after final length are 0. + for i in 0..NUM_BYTES - 1 { + // If the length is i+1, then value_bytes(i+1),...,value_bytes(NUM_BYTES-1) must be 0. + for j in i + 1..NUM_BYTES { + yield_constr.constraint(local_values[index_len(i)] * local_values[value_bytes(j)]); + } } } @@ -416,9 +331,23 @@ impl, const D: usize> Stark for BytePackingSt let next_values: &[ExtensionTarget; NUM_COLUMNS] = vars.get_next_values().try_into().unwrap(); + // Check the range column: First value must be 0, last row + // must be 255, and intermediate rows must increment by 0 + // or 1. 
+ let rc1 = local_values[RANGE_COUNTER]; + let rc2 = next_values[RANGE_COUNTER]; + yield_constr.constraint_first_row(builder, rc1); + let incr = builder.sub_extension(rc2, rc1); + let t = builder.mul_sub_extension(incr, incr, incr); + yield_constr.constraint_transition(builder, t); + let range_max = + builder.constant_extension(F::Extension::from_canonical_usize(BYTE_RANGE_MAX - 1)); + let t = builder.sub_extension(rc1, range_max); + yield_constr.constraint_last_row(builder, t); + // We filter active columns by summing all the byte indices. // Constraining each of them to be boolean is done later on below. - let current_filter = builder.add_many_extension(&local_values[BYTE_INDICES_COLS]); + let current_filter = builder.add_many_extension(&local_values[LEN_INDICES_COLS]); let constraint = builder.mul_sub_extension(current_filter, current_filter, current_filter); yield_constr.constraint(builder, constraint); @@ -434,119 +363,25 @@ impl, const D: usize> Stark for BytePackingSt // Each byte index must be boolean. for i in 0..NUM_BYTES { - let idx_i = local_values[index_bytes(i)]; + let idx_i = local_values[index_len(i)]; let constraint = builder.mul_sub_extension(idx_i, idx_i, idx_i); yield_constr.constraint(builder, constraint); } - // The sequence start flag column must start by one. - let current_sequence_start = local_values[index_bytes(0)]; - let constraint = builder.add_const_extension(current_sequence_start, F::NEG_ONE); - yield_constr.constraint_first_row(builder, constraint); - - // The sequence end flag must be boolean - let current_sequence_end = local_values[SEQUENCE_END]; - let constraint = builder.mul_sub_extension( - current_sequence_end, - current_sequence_end, - current_sequence_end, - ); - yield_constr.constraint(builder, constraint); - - // If filter is off, all flags and byte indices must be off. 
- let byte_indices = builder.add_many_extension(&local_values[BYTE_INDICES_COLS]); - let constraint = builder.add_extension(current_sequence_end, byte_indices); - let constraint = builder.add_extension(constraint, current_is_read); - let constraint = builder.mul_sub_extension(constraint, current_filter, constraint); - yield_constr.constraint(builder, constraint); - // Only padding rows have their filter turned off. - let next_filter = builder.add_many_extension(&next_values[BYTE_INDICES_COLS]); + let next_filter = builder.add_many_extension(&next_values[LEN_INDICES_COLS]); let constraint = builder.sub_extension(next_filter, current_filter); let constraint = builder.mul_extension(next_filter, constraint); yield_constr.constraint_transition(builder, constraint); - // Unless the current sequence end flag is activated, the is_read filter must remain unchanged. - let next_is_read = next_values[IS_READ]; - let diff_is_read = builder.sub_extension(next_is_read, current_is_read); - let constraint = - builder.mul_sub_extension(diff_is_read, current_sequence_end, diff_is_read); - yield_constr.constraint_transition(builder, constraint); - - // If the sequence end flag is activated, the next row must be a new sequence or filter must be off. - let next_sequence_start = next_values[index_bytes(0)]; - let constraint = builder.mul_sub_extension( - current_sequence_end, - next_sequence_start, - current_sequence_end, - ); - let constraint = builder.mul_extension(next_filter, constraint); - yield_constr.constraint_transition(builder, constraint); - - // The active position in a byte sequence must increase by one on every row - // or be one on the next row (i.e. at the start of a new sequence). 
- let current_position = self.get_active_position_circuit(builder, local_values); - let next_position = self.get_active_position_circuit(builder, next_values); - - let position_diff = builder.sub_extension(next_position, current_position); - let is_new_or_inactive = builder.mul_sub_extension(next_filter, next_position, next_filter); - let constraint = - builder.mul_sub_extension(is_new_or_inactive, position_diff, is_new_or_inactive); - yield_constr.constraint_transition(builder, constraint); - - // The last row must be the end of a sequence or a padding row. - let constraint = - builder.mul_sub_extension(current_filter, current_sequence_end, current_filter); - yield_constr.constraint_last_row(builder, constraint); - - // If the next position is one in an active row, the current end flag must be one. - let constraint = builder.mul_extension(next_filter, current_sequence_end); - let constraint = builder.mul_sub_extension(constraint, next_position, constraint); - yield_constr.constraint_transition(builder, constraint); - - // The context, segment and timestamp fields must remain unchanged throughout a byte sequence. - // The virtual address must decrement by one at each step of a sequence. 
- let current_context = local_values[ADDR_CONTEXT]; - let next_context = next_values[ADDR_CONTEXT]; - let current_segment = local_values[ADDR_SEGMENT]; - let next_segment = next_values[ADDR_SEGMENT]; - let current_virtual = local_values[ADDR_VIRTUAL]; - let next_virtual = next_values[ADDR_VIRTUAL]; - let current_timestamp = local_values[TIMESTAMP]; - let next_timestamp = next_values[TIMESTAMP]; - let addr_filter = builder.mul_sub_extension(next_filter, next_sequence_start, next_filter); - { - let constraint = builder.sub_extension(next_context, current_context); - let constraint = builder.mul_extension(addr_filter, constraint); - yield_constr.constraint_transition(builder, constraint); - } - { - let constraint = builder.sub_extension(next_segment, current_segment); - let constraint = builder.mul_extension(addr_filter, constraint); - yield_constr.constraint_transition(builder, constraint); - } - { - let constraint = builder.sub_extension(next_timestamp, current_timestamp); - let constraint = builder.mul_extension(addr_filter, constraint); - yield_constr.constraint_transition(builder, constraint); - } - { - let constraint = builder.sub_extension(current_virtual, next_virtual); - let constraint = builder.mul_sub_extension(addr_filter, constraint, addr_filter); - yield_constr.constraint_transition(builder, constraint); - } - - // If not at the end of a sequence, each next byte must equal the current one - // when reading through the sequence, or the next byte index must be one. 
- for i in 0..NUM_BYTES { - let current_byte = local_values[value_bytes(i)]; - let next_byte = next_values[value_bytes(i)]; - let next_byte_index = next_values[index_bytes(i)]; - let byte_diff = builder.sub_extension(next_byte, current_byte); - let constraint = builder.mul_sub_extension(byte_diff, next_byte_index, byte_diff); - let constraint = - builder.mul_sub_extension(constraint, current_sequence_end, constraint); - yield_constr.constraint_transition(builder, constraint); + // Check that all limbs after final length are 0. + for i in 0..NUM_BYTES - 1 { + // If the length is i+1, then value_bytes(i+1),...,value_bytes(NUM_BYTES-1) must be 0. + for j in i + 1..NUM_BYTES { + let constr = + builder.mul_extension(local_values[index_len(i)], local_values[value_bytes(j)]); + yield_constr.constraint(builder, constr); + } } } @@ -554,11 +389,12 @@ impl, const D: usize> Stark for BytePackingSt 3 } - fn lookups(&self) -> Vec { + fn lookups(&self) -> Vec> { vec![Lookup { - columns: (value_bytes(0)..value_bytes(0) + NUM_BYTES).collect(), - table_column: RANGE_COUNTER, - frequencies_column: RC_FREQUENCIES, + columns: Column::singles(value_bytes(0)..value_bytes(0) + NUM_BYTES).collect(), + table_column: Column::single(RANGE_COUNTER), + frequencies_column: Column::single(RC_FREQUENCIES), + filter_columns: vec![None; NUM_BYTES], }] } } diff --git a/evm/src/byte_packing/columns.rs b/evm/src/byte_packing/columns.rs index 4eff0df8f5..cbed53de1d 100644 --- a/evm/src/byte_packing/columns.rs +++ b/evm/src/byte_packing/columns.rs @@ -6,28 +6,28 @@ use crate::byte_packing::NUM_BYTES; /// 1 if this is a READ operation, and 0 if this is a WRITE operation. pub(crate) const IS_READ: usize = 0; -/// 1 if this is the end of a sequence of bytes. -/// This is also used as filter for the CTL. 
-pub(crate) const SEQUENCE_END: usize = IS_READ + 1; -pub(super) const BYTES_INDICES_START: usize = SEQUENCE_END + 1; -pub(crate) const fn index_bytes(i: usize) -> usize { +pub(super) const LEN_INDICES_START: usize = IS_READ + 1; +// There are `NUM_BYTES` columns used to represent the length of +// the input byte sequence for a (un)packing operation. +// index_len(i) is 1 iff the length is i+1. +pub(crate) const fn index_len(i: usize) -> usize { debug_assert!(i < NUM_BYTES); - BYTES_INDICES_START + i + LEN_INDICES_START + i } -// Note: Those are used as filter for distinguishing active vs padding rows, -// and also to obtain the length of a sequence of bytes being processed. -pub(crate) const BYTE_INDICES_COLS: Range = - BYTES_INDICES_START..BYTES_INDICES_START + NUM_BYTES; +// Note: Those are used to obtain the length of a sequence of bytes being processed. +pub(crate) const LEN_INDICES_COLS: Range = LEN_INDICES_START..LEN_INDICES_START + NUM_BYTES; -pub(crate) const ADDR_CONTEXT: usize = BYTES_INDICES_START + NUM_BYTES; +pub(crate) const ADDR_CONTEXT: usize = LEN_INDICES_START + NUM_BYTES; pub(crate) const ADDR_SEGMENT: usize = ADDR_CONTEXT + 1; pub(crate) const ADDR_VIRTUAL: usize = ADDR_SEGMENT + 1; pub(crate) const TIMESTAMP: usize = ADDR_VIRTUAL + 1; // 32 byte limbs hold a total of 256 bits. const BYTES_VALUES_START: usize = TIMESTAMP + 1; +// There are `NUM_BYTES` columns used to store the values of the bytes +// that are being read/written for an (un)packing operation. pub(crate) const fn value_bytes(i: usize) -> usize { debug_assert!(i < NUM_BYTES); BYTES_VALUES_START + i @@ -38,4 +38,5 @@ pub(crate) const RANGE_COUNTER: usize = BYTES_VALUES_START + NUM_BYTES; /// The frequencies column used in logUp. pub(crate) const RC_FREQUENCIES: usize = RANGE_COUNTER + 1; +/// Number of columns in `BytePackingStark`. 
pub(crate) const NUM_COLUMNS: usize = RANGE_COUNTER + 2; diff --git a/evm/src/byte_packing/mod.rs b/evm/src/byte_packing/mod.rs index 7cc93374ca..3767b21ed6 100644 --- a/evm/src/byte_packing/mod.rs +++ b/evm/src/byte_packing/mod.rs @@ -6,4 +6,5 @@ pub mod byte_packing_stark; pub mod columns; +/// Maximum number of bytes being processed by a byte (un)packing operation. pub(crate) const NUM_BYTES: usize = 32; diff --git a/evm/src/config.rs b/evm/src/config.rs index a593c827c2..3f88d99f5d 100644 --- a/evm/src/config.rs +++ b/evm/src/config.rs @@ -1,20 +1,29 @@ use plonky2::fri::reduction_strategies::FriReductionStrategy; use plonky2::fri::{FriConfig, FriParams}; +/// A configuration containing the different parameters to be used by the STARK prover. pub struct StarkConfig { + /// The targeted security level for the proofs generated with this configuration. pub security_bits: usize, /// The number of challenge points to generate, for IOPs that have soundness errors of (roughly) /// `degree / |F|`. pub num_challenges: usize, + /// The configuration of the FRI sub-protocol. pub fri_config: FriConfig, } +impl Default for StarkConfig { + fn default() -> Self { + Self::standard_fast_config() + } +} + impl StarkConfig { /// A typical configuration with a rate of 2, resulting in fast but large proofs. /// Targets ~100 bit conjectured security. - pub fn standard_fast_config() -> Self { + pub const fn standard_fast_config() -> Self { Self { security_bits: 100, num_challenges: 2, diff --git a/evm/src/constraint_consumer.rs b/evm/src/constraint_consumer.rs index 49dc018ce3..919b51638a 100644 --- a/evm/src/constraint_consumer.rs +++ b/evm/src/constraint_consumer.rs @@ -1,4 +1,4 @@ -use std::marker::PhantomData; +use core::marker::PhantomData; use plonky2::field::extension::Extendable; use plonky2::field::packed::PackedField; @@ -29,7 +29,7 @@ pub struct ConstraintConsumer { } impl ConstraintConsumer

{ - pub fn new( + pub(crate) fn new( alphas: Vec, z_last: P, lagrange_basis_first: P, @@ -44,17 +44,17 @@ impl ConstraintConsumer

{ } } - pub fn accumulators(self) -> Vec

{ + pub(crate) fn accumulators(self) -> Vec

{ self.constraint_accs } /// Add one constraint valid on all rows except the last. - pub fn constraint_transition(&mut self, constraint: P) { + pub(crate) fn constraint_transition(&mut self, constraint: P) { self.constraint(constraint * self.z_last); } /// Add one constraint on all rows. - pub fn constraint(&mut self, constraint: P) { + pub(crate) fn constraint(&mut self, constraint: P) { for (&alpha, acc) in self.alphas.iter().zip(&mut self.constraint_accs) { *acc *= alpha; *acc += constraint; @@ -63,13 +63,13 @@ impl ConstraintConsumer

{ /// Add one constraint, but first multiply it by a filter such that it will only apply to the /// first row of the trace. - pub fn constraint_first_row(&mut self, constraint: P) { + pub(crate) fn constraint_first_row(&mut self, constraint: P) { self.constraint(constraint * self.lagrange_basis_first); } /// Add one constraint, but first multiply it by a filter such that it will only apply to the /// last row of the trace. - pub fn constraint_last_row(&mut self, constraint: P) { + pub(crate) fn constraint_last_row(&mut self, constraint: P) { self.constraint(constraint * self.lagrange_basis_last); } } @@ -96,7 +96,7 @@ pub struct RecursiveConstraintConsumer, const D: us } impl, const D: usize> RecursiveConstraintConsumer { - pub fn new( + pub(crate) fn new( zero: ExtensionTarget, alphas: Vec, z_last: ExtensionTarget, @@ -113,12 +113,12 @@ impl, const D: usize> RecursiveConstraintConsumer Vec> { + pub(crate) fn accumulators(self) -> Vec> { self.constraint_accs } /// Add one constraint valid on all rows except the last. - pub fn constraint_transition( + pub(crate) fn constraint_transition( &mut self, builder: &mut CircuitBuilder, constraint: ExtensionTarget, @@ -128,7 +128,7 @@ impl, const D: usize> RecursiveConstraintConsumer, constraint: ExtensionTarget, @@ -140,7 +140,7 @@ impl, const D: usize> RecursiveConstraintConsumer, constraint: ExtensionTarget, @@ -151,7 +151,7 @@ impl, const D: usize> RecursiveConstraintConsumer, constraint: ExtensionTarget, diff --git a/evm/src/cpu/bootstrap_kernel.rs b/evm/src/cpu/bootstrap_kernel.rs deleted file mode 100644 index 759c852aae..0000000000 --- a/evm/src/cpu/bootstrap_kernel.rs +++ /dev/null @@ -1,161 +0,0 @@ -//! The initial phase of execution, where the kernel code is hashed while being written to memory. -//! The hash is then checked against a precomputed kernel hash. 
- -use itertools::Itertools; -use plonky2::field::extension::Extendable; -use plonky2::field::packed::PackedField; -use plonky2::field::types::Field; -use plonky2::hash::hash_types::RichField; -use plonky2::iop::ext_target::ExtensionTarget; -use plonky2::plonk::circuit_builder::CircuitBuilder; - -use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::cpu::columns::CpuColumnsView; -use crate::cpu::kernel::aggregator::KERNEL; -use crate::cpu::membus::NUM_GP_CHANNELS; -use crate::generation::state::GenerationState; -use crate::memory::segments::Segment; -use crate::witness::memory::MemoryAddress; -use crate::witness::util::{keccak_sponge_log, mem_write_gp_log_and_fill}; - -pub(crate) fn generate_bootstrap_kernel(state: &mut GenerationState) { - // Iterate through chunks of the code, such that we can write one chunk to memory per row. - for chunk in &KERNEL.code.iter().enumerate().chunks(NUM_GP_CHANNELS) { - let mut cpu_row = CpuColumnsView::default(); - cpu_row.clock = F::from_canonical_usize(state.traces.clock()); - cpu_row.is_bootstrap_kernel = F::ONE; - - // Write this chunk to memory, while simultaneously packing its bytes into a u32 word. - for (channel, (addr, &byte)) in chunk.enumerate() { - let address = MemoryAddress::new(0, Segment::Code, addr); - let write = - mem_write_gp_log_and_fill(channel, address, state, &mut cpu_row, byte.into()); - state.traces.push_memory(write); - } - - state.traces.push_cpu(cpu_row); - } - - let mut final_cpu_row = CpuColumnsView::default(); - final_cpu_row.clock = F::from_canonical_usize(state.traces.clock()); - final_cpu_row.is_bootstrap_kernel = F::ONE; - final_cpu_row.is_keccak_sponge = F::ONE; - // The Keccak sponge CTL uses memory value columns for its inputs and outputs. 
- final_cpu_row.mem_channels[0].value[0] = F::ZERO; // context - final_cpu_row.mem_channels[1].value[0] = F::from_canonical_usize(Segment::Code as usize); // segment - final_cpu_row.mem_channels[2].value[0] = F::ZERO; // virt - final_cpu_row.mem_channels[3].value[0] = F::from_canonical_usize(KERNEL.code.len()); // len - final_cpu_row.mem_channels[4].value = KERNEL.code_hash.map(F::from_canonical_u32); - final_cpu_row.mem_channels[4].value.reverse(); - keccak_sponge_log( - state, - MemoryAddress::new(0, Segment::Code, 0), - KERNEL.code.clone(), - ); - state.traces.push_cpu(final_cpu_row); - log::info!("Bootstrapping took {} cycles", state.traces.clock()); -} - -pub(crate) fn eval_bootstrap_kernel_packed>( - local_values: &CpuColumnsView

, - next_values: &CpuColumnsView

, - yield_constr: &mut ConstraintConsumer

, -) { - // IS_BOOTSTRAP_KERNEL must have an init value of 1, a final value of 0, and a delta in {0, -1}. - let local_is_bootstrap = local_values.is_bootstrap_kernel; - let next_is_bootstrap = next_values.is_bootstrap_kernel; - yield_constr.constraint_first_row(local_is_bootstrap - P::ONES); - yield_constr.constraint_last_row(local_is_bootstrap); - let delta_is_bootstrap = next_is_bootstrap - local_is_bootstrap; - yield_constr.constraint_transition(delta_is_bootstrap * (delta_is_bootstrap + P::ONES)); - - // If this is a bootloading row and the i'th memory channel is used, it must have the right - // address, name context = 0, segment = Code, virt = clock * NUM_GP_CHANNELS + i. - let code_segment = F::from_canonical_usize(Segment::Code as usize); - for (i, channel) in local_values.mem_channels.iter().enumerate() { - let filter = local_is_bootstrap * channel.used; - yield_constr.constraint(filter * channel.addr_context); - yield_constr.constraint(filter * (channel.addr_segment - code_segment)); - let expected_virt = local_values.clock * F::from_canonical_usize(NUM_GP_CHANNELS) - + F::from_canonical_usize(i); - yield_constr.constraint(filter * (channel.addr_virtual - expected_virt)); - } - - // If this is the final bootstrap row (i.e. 
delta_is_bootstrap = 1), check that - // - all memory channels are disabled - // - the current kernel hash matches a precomputed one - for channel in local_values.mem_channels.iter() { - yield_constr.constraint_transition(delta_is_bootstrap * channel.used); - } - for (&expected, actual) in KERNEL - .code_hash - .iter() - .rev() - .zip(local_values.mem_channels.last().unwrap().value) - { - let expected = P::from(F::from_canonical_u32(expected)); - let diff = expected - actual; - yield_constr.constraint_transition(delta_is_bootstrap * diff); - } -} - -pub(crate) fn eval_bootstrap_kernel_ext_circuit, const D: usize>( - builder: &mut CircuitBuilder, - local_values: &CpuColumnsView>, - next_values: &CpuColumnsView>, - yield_constr: &mut RecursiveConstraintConsumer, -) { - let one = builder.one_extension(); - - // IS_BOOTSTRAP_KERNEL must have an init value of 1, a final value of 0, and a delta in {0, -1}. - let local_is_bootstrap = local_values.is_bootstrap_kernel; - let next_is_bootstrap = next_values.is_bootstrap_kernel; - let constraint = builder.sub_extension(local_is_bootstrap, one); - yield_constr.constraint_first_row(builder, constraint); - yield_constr.constraint_last_row(builder, local_is_bootstrap); - let delta_is_bootstrap = builder.sub_extension(next_is_bootstrap, local_is_bootstrap); - let constraint = - builder.mul_add_extension(delta_is_bootstrap, delta_is_bootstrap, delta_is_bootstrap); - yield_constr.constraint_transition(builder, constraint); - - // If this is a bootloading row and the i'th memory channel is used, it must have the right - // address, name context = 0, segment = Code, virt = clock * NUM_GP_CHANNELS + i. 
- let code_segment = - builder.constant_extension(F::Extension::from_canonical_usize(Segment::Code as usize)); - for (i, channel) in local_values.mem_channels.iter().enumerate() { - let filter = builder.mul_extension(local_is_bootstrap, channel.used); - let constraint = builder.mul_extension(filter, channel.addr_context); - yield_constr.constraint(builder, constraint); - - let segment_diff = builder.sub_extension(channel.addr_segment, code_segment); - let constraint = builder.mul_extension(filter, segment_diff); - yield_constr.constraint(builder, constraint); - - let i_ext = builder.constant_extension(F::Extension::from_canonical_usize(i)); - let num_gp_channels_f = F::from_canonical_usize(NUM_GP_CHANNELS); - let expected_virt = - builder.mul_const_add_extension(num_gp_channels_f, local_values.clock, i_ext); - let virt_diff = builder.sub_extension(channel.addr_virtual, expected_virt); - let constraint = builder.mul_extension(filter, virt_diff); - yield_constr.constraint(builder, constraint); - } - - // If this is the final bootstrap row (i.e. 
delta_is_bootstrap = 1), check that - // - all memory channels are disabled - // - the current kernel hash matches a precomputed one - for channel in local_values.mem_channels.iter() { - let constraint = builder.mul_extension(delta_is_bootstrap, channel.used); - yield_constr.constraint_transition(builder, constraint); - } - for (&expected, actual) in KERNEL - .code_hash - .iter() - .rev() - .zip(local_values.mem_channels.last().unwrap().value) - { - let expected = builder.constant_extension(F::Extension::from_canonical_u32(expected)); - let diff = builder.sub_extension(expected, actual); - let constraint = builder.mul_extension(delta_is_bootstrap, diff); - yield_constr.constraint_transition(builder, constraint); - } -} diff --git a/evm/src/cpu/byte_unpacking.rs b/evm/src/cpu/byte_unpacking.rs new file mode 100644 index 0000000000..39053141d6 --- /dev/null +++ b/evm/src/cpu/byte_unpacking.rs @@ -0,0 +1,94 @@ +use plonky2::field::extension::Extendable; +use plonky2::field::packed::PackedField; +use plonky2::field::types::Field; +use plonky2::hash::hash_types::RichField; +use plonky2::iop::ext_target::ExtensionTarget; +use plonky2::plonk::circuit_builder::CircuitBuilder; + +use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; +use crate::cpu::columns::CpuColumnsView; + +pub(crate) fn eval_packed( + lv: &CpuColumnsView

, + nv: &CpuColumnsView

, + yield_constr: &mut ConstraintConsumer

, +) { + // The MSTORE_32BYTES opcodes are differentiated from MLOAD_32BYTES + // by the 5th bit set to 0. + let filter = lv.op.m_op_32bytes * (lv.opcode_bits[5] - P::ONES); + + // The address to write to is stored in the first memory channel. + // It contains virt, segment, ctx in its first 3 limbs, and 0 otherwise. + // The new address is identical, except for its `virtual` limb that is increased by the corresponding `len` offset. + let new_addr = nv.mem_channels[0].value; + let written_addr = lv.mem_channels[0].value; + + // Read len from opcode bits and constrain the pushed new offset. + let len_bits: P = lv.opcode_bits[..5] + .iter() + .enumerate() + .map(|(i, &bit)| bit * P::Scalar::from_canonical_u64(1 << i)) + .sum(); + let len = len_bits + P::ONES; + + // Check that `virt` is increased properly. + yield_constr.constraint(filter * (new_addr[0] - written_addr[0] - len)); + + // Check that `segment` and `ctx` do not change. + yield_constr.constraint(filter * (new_addr[1] - written_addr[1])); + yield_constr.constraint(filter * (new_addr[2] - written_addr[2])); + + // Check that the rest of the returned address is null. + for &limb in &new_addr[3..] { + yield_constr.constraint(filter * limb); + } +} + +pub(crate) fn eval_ext_circuit, const D: usize>( + builder: &mut CircuitBuilder, + lv: &CpuColumnsView>, + nv: &CpuColumnsView>, + yield_constr: &mut RecursiveConstraintConsumer, +) { + // The MSTORE_32BYTES opcodes are differentiated from MLOAD_32BYTES + // by the 5th bit set to 0. + let filter = + builder.mul_sub_extension(lv.op.m_op_32bytes, lv.opcode_bits[5], lv.op.m_op_32bytes); + + // The address to write to is stored in the first memory channel. + // It contains virt, segment, ctx in its first 3 limbs, and 0 otherwise. + // The new address is identical, except for its `virtual` limb that is increased by the corresponding `len` offset. 
+ let new_addr = nv.mem_channels[0].value; + let written_addr = lv.mem_channels[0].value; + + // Read len from opcode bits and constrain the pushed new offset. + let len_bits = lv.opcode_bits[..5].iter().enumerate().fold( + builder.zero_extension(), + |cumul, (i, &bit)| { + builder.mul_const_add_extension(F::from_canonical_u64(1 << i), bit, cumul) + }, + ); + + // Check that `virt` is increased properly. + let diff = builder.sub_extension(new_addr[0], written_addr[0]); + let diff = builder.sub_extension(diff, len_bits); + let constr = builder.mul_sub_extension(filter, diff, filter); + yield_constr.constraint(builder, constr); + + // Check that `segment` and `ctx` do not change. + { + let diff = builder.sub_extension(new_addr[1], written_addr[1]); + let constr = builder.mul_extension(filter, diff); + yield_constr.constraint(builder, constr); + + let diff = builder.sub_extension(new_addr[2], written_addr[2]); + let constr = builder.mul_extension(filter, diff); + yield_constr.constraint(builder, constr); + } + + // Check that the rest of the returned address is null. + for &limb in &new_addr[3..] { + let constr = builder.mul_extension(filter, limb); + yield_constr.constraint(builder, constr); + } +} diff --git a/evm/src/cpu/clock.rs b/evm/src/cpu/clock.rs new file mode 100644 index 0000000000..cd7b17d8ed --- /dev/null +++ b/evm/src/cpu/clock.rs @@ -0,0 +1,37 @@ +use plonky2::field::extension::Extendable; +use plonky2::field::packed::PackedField; +use plonky2::hash::hash_types::RichField; +use plonky2::iop::ext_target::ExtensionTarget; + +use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; +use crate::cpu::columns::CpuColumnsView; + +/// Check the correct updating of `clock`. +pub(crate) fn eval_packed( + lv: &CpuColumnsView

, + nv: &CpuColumnsView

, + yield_constr: &mut ConstraintConsumer

, +) { + // The clock is 0 at the beginning. + yield_constr.constraint_first_row(lv.clock); + // The clock is incremented by 1 at each row. + yield_constr.constraint_transition(nv.clock - lv.clock - P::ONES); +} + +/// Circuit version of `eval_packed`. +/// Check the correct updating of `clock`. +pub(crate) fn eval_ext_circuit, const D: usize>( + builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, + lv: &CpuColumnsView>, + nv: &CpuColumnsView>, + yield_constr: &mut RecursiveConstraintConsumer, +) { + // The clock is 0 at the beginning. + yield_constr.constraint_first_row(builder, lv.clock); + // The clock is incremented by 1 at each row. + { + let new_clock = builder.add_const_extension(lv.clock, F::ONE); + let constr = builder.sub_extension(nv.clock, new_clock); + yield_constr.constraint_transition(builder, constr); + } +} diff --git a/evm/src/cpu/columns/general.rs b/evm/src/cpu/columns/general.rs index d4f3447380..f565acc625 100644 --- a/evm/src/cpu/columns/general.rs +++ b/evm/src/cpu/columns/general.rs @@ -1,6 +1,6 @@ -use std::borrow::{Borrow, BorrowMut}; -use std::fmt::{Debug, Formatter}; -use std::mem::{size_of, transmute}; +use core::borrow::{Borrow, BorrowMut}; +use core::fmt::{Debug, Formatter}; +use core::mem::{size_of, transmute}; /// General purpose columns, which can have different meanings depending on what CTL or other /// operation is occurring at this row. @@ -14,58 +14,69 @@ pub(crate) union CpuGeneralColumnsView { } impl CpuGeneralColumnsView { - // SAFETY: Each view is a valid interpretation of the underlying array. + /// View of the columns used for exceptions: they are the exception code bits. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn exception(&self) -> &CpuExceptionView { unsafe { &self.exception } } - // SAFETY: Each view is a valid interpretation of the underlying array. + /// Mutable view of the column required for exceptions: they are the exception code bits. 
+ /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn exception_mut(&mut self) -> &mut CpuExceptionView { unsafe { &mut self.exception } } - // SAFETY: Each view is a valid interpretation of the underlying array. + /// View of the columns required for logic operations. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn logic(&self) -> &CpuLogicView { unsafe { &self.logic } } - // SAFETY: Each view is a valid interpretation of the underlying array. + /// Mutable view of the columns required for logic operations. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn logic_mut(&mut self) -> &mut CpuLogicView { unsafe { &mut self.logic } } - // SAFETY: Each view is a valid interpretation of the underlying array. + /// View of the columns required for jump operations. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn jumps(&self) -> &CpuJumpsView { unsafe { &self.jumps } } - // SAFETY: Each view is a valid interpretation of the underlying array. + /// Mutable view of the columns required for jump operations. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn jumps_mut(&mut self) -> &mut CpuJumpsView { unsafe { &mut self.jumps } } - // SAFETY: Each view is a valid interpretation of the underlying array. + /// View of the columns required for shift operations. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn shift(&self) -> &CpuShiftView { unsafe { &self.shift } } - // SAFETY: Each view is a valid interpretation of the underlying array. + /// Mutable view of the columns required for shift operations. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn shift_mut(&mut self) -> &mut CpuShiftView { unsafe { &mut self.shift } } - // SAFETY: Each view is a valid interpretation of the underlying array. 
+ /// View of the columns required for the stack top. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn stack(&self) -> &CpuStackView { unsafe { &self.stack } } - // SAFETY: Each view is a valid interpretation of the underlying array. + /// Mutable view of the columns required for the stack top. + /// SAFETY: Each view is a valid interpretation of the underlying array. pub(crate) fn stack_mut(&mut self) -> &mut CpuStackView { unsafe { &mut self.stack } } } impl PartialEq for CpuGeneralColumnsView { + #[allow(clippy::unconditional_recursion)] // false positive fn eq(&self, other: &Self) -> bool { let self_arr: &[T; NUM_SHARED_COLUMNS] = self.borrow(); let other_arr: &[T; NUM_SHARED_COLUMNS] = other.borrow(); @@ -94,41 +105,53 @@ impl BorrowMut<[T; NUM_SHARED_COLUMNS]> for CpuGeneralColumnsView { } } +/// View of the first three `CpuGeneralColumns` containing exception code bits. #[derive(Copy, Clone)] pub(crate) struct CpuExceptionView { - // Exception code as little-endian bits. + /// Exception code as little-endian bits. pub(crate) exc_code_bits: [T; 3], } +/// View of the `CpuGeneralColumns` storing pseudo-inverses used to prove logic operations. #[derive(Copy, Clone)] pub(crate) struct CpuLogicView { - // Pseudoinverse of `(input0 - input1)`. Used prove that they are unequal. Assumes 32-bit limbs. + /// Pseudoinverse of `(input0 - input1)`. Used prove that they are unequal. Assumes 32-bit limbs. pub(crate) diff_pinv: [T; 8], } +/// View of the first two `CpuGeneralColumns` storing a flag and a pseudoinverse used to prove jumps. #[derive(Copy, Clone)] pub(crate) struct CpuJumpsView { - // A flag. + /// A flag indicating whether a jump should occur. pub(crate) should_jump: T, - // Pseudoinverse of `cond.iter().sum()`. Used to check `should_jump`. + /// Pseudoinverse of `cond.iter().sum()`. Used to check `should_jump`. 
pub(crate) cond_sum_pinv: T, } +/// View of the first `CpuGeneralColumns` storing a pseudoinverse used to prove shift operations. #[derive(Copy, Clone)] pub(crate) struct CpuShiftView { - // For a shift amount of displacement: [T], this is the inverse of - // sum(displacement[1..]) or zero if the sum is zero. + /// For a shift amount of displacement: [T], this is the inverse of + /// sum(displacement[1..]) or zero if the sum is zero. pub(crate) high_limb_sum_inv: T, } +/// View of the last four `CpuGeneralColumns` storing stack-related variables. The first three are used +/// for conditionally enabling and disabling channels when reading the next `stack_top`, and the fourth one +/// is used to check for stack overflow. #[derive(Copy, Clone)] pub(crate) struct CpuStackView { - // Used for conditionally enabling and disabling channels when reading the next `stack_top`. - _unused: [T; 5], + _unused: [T; 4], + /// Pseudoinverse of `stack_len - num_pops`. pub(crate) stack_inv: T, + /// stack_inv * stack_len. pub(crate) stack_inv_aux: T, + /// Used to reduce the degree of stack constraints when needed. pub(crate) stack_inv_aux_2: T, + /// Pseudoinverse of `nv.stack_len - (MAX_USER_STACK_SIZE + 1)` to check for stack overflow. + pub(crate) stack_len_bounds_aux: T, } -// `u8` is guaranteed to have a `size_of` of 1. -pub const NUM_SHARED_COLUMNS: usize = size_of::>(); +/// Number of columns shared by all the views of `CpuGeneralColumnsView`. +/// `u8` is guaranteed to have a `size_of` of 1. 
+pub(crate) const NUM_SHARED_COLUMNS: usize = size_of::>(); diff --git a/evm/src/cpu/columns/mod.rs b/evm/src/cpu/columns/mod.rs index b7b4f780e0..92da4e9979 100644 --- a/evm/src/cpu/columns/mod.rs +++ b/evm/src/cpu/columns/mod.rs @@ -1,7 +1,7 @@ -use std::borrow::{Borrow, BorrowMut}; -use std::fmt::Debug; -use std::mem::{size_of, transmute}; -use std::ops::{Index, IndexMut}; +use core::borrow::{Borrow, BorrowMut}; +use core::fmt::Debug; +use core::mem::{size_of, transmute}; +use core::ops::{Index, IndexMut}; use plonky2::field::types::Field; @@ -12,31 +12,48 @@ use crate::memory; use crate::util::{indices_arr, transmute_no_compile_time_size_checks}; mod general; +/// Cpu operation flags. pub(crate) mod ops; +/// 32-bit limbs of the value stored in the current memory channel. pub type MemValue = [T; memory::VALUE_LIMBS]; +/// View of the columns required for one memory channel. #[repr(C)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct MemoryChannelView { +pub(crate) struct MemoryChannelView { /// 1 if this row includes a memory operation in the `i`th channel of the memory bus, otherwise /// 0. pub used: T, + /// 1 if a read is performed on the `i`th channel of the memory bus, otherwise 0. pub is_read: T, + /// Context of the memory operation in the `i`th channel of the memory bus. pub addr_context: T, + /// Segment of the memory operation in the `ith` channel of the memory bus. pub addr_segment: T, + /// Virtual address of the memory operation in the `ith` channel of the memory bus. pub addr_virtual: T, + /// Value, subdivided into 32-bit limbs, stored in the `ith` channel of the memory bus. pub value: MemValue, } +/// View of all the columns in `CpuStark`. #[repr(C)] -#[derive(Clone, Copy, Eq, PartialEq, Debug)] -pub struct CpuColumnsView { - /// Filter. 1 if the row is part of bootstrapping the kernel code, 0 otherwise. 
- pub is_bootstrap_kernel: T, +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +// A more lightweight channel, sharing values with the 0-th memory channel +// (which contains the top of the stack). +pub(crate) struct PartialMemoryChannelView { + pub used: T, + pub is_read: T, + pub addr_context: T, + pub addr_segment: T, + pub addr_virtual: T, +} +#[repr(C)] +#[derive(Clone, Copy, Eq, PartialEq, Debug)] +pub(crate) struct CpuColumnsView { /// If CPU cycle: Current context. - // TODO: this is currently unconstrained pub context: T, /// If CPU cycle: Context for code memory channel. @@ -48,15 +65,11 @@ pub struct CpuColumnsView { /// If CPU cycle: The stack length. pub stack_len: T, - /// If CPU cycle: A prover-provided value needed to show that the instruction does not cause the - /// stack to underflow or overflow. - pub stack_len_bounds_aux: T, - /// If CPU cycle: We're in kernel (privileged) mode. pub is_kernel_mode: T, - /// If CPU cycle: Gas counter, split in two 32-bit limbs in little-endian order. - pub gas: [T; 2], + /// If CPU cycle: Gas counter. + pub gas: T, /// If CPU cycle: flags for EVM instructions (a few cannot be shared; see the comments in /// `OpsColumnsView`). @@ -65,17 +78,22 @@ pub struct CpuColumnsView { /// If CPU cycle: the opcode, broken up into bits in little-endian order. pub opcode_bits: [T; 8], - /// Filter. 1 iff a Keccak sponge lookup is performed on this row. - pub is_keccak_sponge: T, - + /// Columns shared by various operations. pub(crate) general: CpuGeneralColumnsView, + /// CPU clock. pub(crate) clock: T, + + /// Memory bus channels in the CPU. + /// Full channels are comprised of 13 columns. pub mem_channels: [MemoryChannelView; NUM_GP_CHANNELS], + /// Partial channel is only comprised of 5 columns. + pub(crate) partial_channel: PartialMemoryChannelView, } -// `u8` is guaranteed to have a `size_of` of 1. -pub const NUM_CPU_COLUMNS: usize = size_of::>(); +/// Total number of columns in `CpuStark`. 
+/// `u8` is guaranteed to have a `size_of` of 1. +pub(crate) const NUM_CPU_COLUMNS: usize = size_of::>(); impl Default for CpuColumnsView { fn default() -> Self { @@ -146,4 +164,5 @@ const fn make_col_map() -> CpuColumnsView { unsafe { transmute::<[usize; NUM_CPU_COLUMNS], CpuColumnsView>(indices_arr) } } -pub const COL_MAP: CpuColumnsView = make_col_map(); +/// Mapping between [0..NUM_CPU_COLUMNS-1] and the CPU columns. +pub(crate) const COL_MAP: CpuColumnsView = make_col_map(); diff --git a/evm/src/cpu/columns/ops.rs b/evm/src/cpu/columns/ops.rs index 270b0ab871..c15d657229 100644 --- a/evm/src/cpu/columns/ops.rs +++ b/evm/src/cpu/columns/ops.rs @@ -1,41 +1,55 @@ -use std::borrow::{Borrow, BorrowMut}; -use std::mem::{size_of, transmute}; -use std::ops::{Deref, DerefMut}; +use core::borrow::{Borrow, BorrowMut}; +use core::mem::{size_of, transmute}; +use core::ops::{Deref, DerefMut}; use crate::util::transmute_no_compile_time_size_checks; +/// Structure representing the flags for the various opcodes. #[repr(C)] #[derive(Clone, Copy, Eq, PartialEq, Debug)] -pub struct OpsColumnsView { - pub binary_op: T, // Combines ADD, MUL, SUB, DIV, MOD, LT, GT and BYTE flags. - pub ternary_op: T, // Combines ADDMOD, MULMOD and SUBMOD flags. - pub fp254_op: T, // Combines ADD_FP254, MUL_FP254 and SUB_FP254 flags. - pub eq_iszero: T, // Combines EQ and ISZERO flags. - pub logic_op: T, // Combines AND, OR and XOR flags. - pub not: T, - pub shift: T, // Combines SHL and SHR flags. - pub keccak_general: T, - pub prover_input: T, - pub pop: T, - pub jumps: T, // Combines JUMP and JUMPI flags. - pub pc: T, - pub jumpdest: T, - pub push0: T, - pub push: T, +pub(crate) struct OpsColumnsView { + /// Combines ADD, MUL, SUB, DIV, MOD, LT, GT and BYTE flags. + pub binary_op: T, + /// Combines ADDMOD, MULMOD and SUBMOD flags. + pub ternary_op: T, + /// Combines ADD_FP254, MUL_FP254 and SUB_FP254 flags. + pub fp254_op: T, + /// Combines EQ and ISZERO flags. 
+ pub eq_iszero: T, + /// Combines AND, OR and XOR flags. + pub logic_op: T, + /// Combines NOT and POP flags. + pub not_pop: T, + /// Combines SHL and SHR flags. + pub shift: T, + /// Combines JUMPDEST and KECCAK_GENERAL flags. + pub jumpdest_keccak_general: T, + /// Combines JUMP and JUMPI flags. + pub jumps: T, + /// Combines PUSH and PROVER_INPUT flags. + pub push_prover_input: T, + /// Combines DUP and SWAP flags. pub dup_swap: T, - pub get_context: T, - pub set_context: T, - pub mstore_32bytes: T, - pub mload_32bytes: T, + /// Combines GET_CONTEXT and SET_CONTEXT flags. + pub context_op: T, + /// Combines MSTORE_32BYTES and MLOAD_32BYTES. + pub m_op_32bytes: T, + /// Flag for EXIT_KERNEL. pub exit_kernel: T, + /// Combines MSTORE_GENERAL and MLOAD_GENERAL flags. pub m_op_general: T, + /// Combines PC and PUSH0 + pub pc_push0: T, + /// Flag for syscalls. pub syscall: T, + /// Flag for exceptions. pub exception: T, } -// `u8` is guaranteed to have a `size_of` of 1. -pub const NUM_OPS_COLUMNS: usize = size_of::>(); +/// Number of columns in Cpu Stark. +/// `u8` is guaranteed to have a `size_of` of 1. 
+pub(crate) const NUM_OPS_COLUMNS: usize = size_of::>(); impl From<[T; NUM_OPS_COLUMNS]> for OpsColumnsView { fn from(value: [T; NUM_OPS_COLUMNS]) -> Self { diff --git a/evm/src/cpu/contextops.rs b/evm/src/cpu/contextops.rs index 1683c30e56..ec4e5e5e6e 100644 --- a/evm/src/cpu/contextops.rs +++ b/evm/src/cpu/contextops.rs @@ -1,3 +1,4 @@ +use itertools::izip; use plonky2::field::extension::Extendable; use plonky2::field::packed::PackedField; use plonky2::field::types::Field; @@ -5,277 +6,339 @@ use plonky2::hash::hash_types::RichField; use plonky2::iop::ext_target::ExtensionTarget; use plonky2::plonk::circuit_builder::CircuitBuilder; +use super::columns::ops::OpsColumnsView; +use super::cpu_stark::{disable_unused_channels, disable_unused_channels_circuit}; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::columns::CpuColumnsView; -use crate::cpu::kernel::constants::context_metadata::ContextMetadata; use crate::memory::segments::Segment; +// If true, the instruction will keep the current context for the next row. +// If false, next row's context is handled manually. +const KEEPS_CONTEXT: OpsColumnsView = OpsColumnsView { + binary_op: true, + ternary_op: true, + fp254_op: true, + eq_iszero: true, + logic_op: true, + not_pop: true, + shift: true, + jumpdest_keccak_general: true, + push_prover_input: true, + jumps: true, + pc_push0: true, + dup_swap: true, + context_op: false, + m_op_32bytes: true, + exit_kernel: true, + m_op_general: true, + syscall: true, + exception: true, +}; + +fn eval_packed_keep( + lv: &CpuColumnsView

, + nv: &CpuColumnsView

, + yield_constr: &mut ConstraintConsumer

, +) { + for (op, keeps_context) in izip!(lv.op.into_iter(), KEEPS_CONTEXT.into_iter()) { + if keeps_context { + yield_constr.constraint_transition(op * (nv.context - lv.context)); + } + } + + // context_op is hybrid; we evaluate it separately. + let is_get_context = lv.op.context_op * (lv.opcode_bits[0] - P::ONES); + yield_constr.constraint_transition(is_get_context * (nv.context - lv.context)); +} + +fn eval_ext_circuit_keep, const D: usize>( + builder: &mut CircuitBuilder, + lv: &CpuColumnsView>, + nv: &CpuColumnsView>, + yield_constr: &mut RecursiveConstraintConsumer, +) { + for (op, keeps_context) in izip!(lv.op.into_iter(), KEEPS_CONTEXT.into_iter()) { + if keeps_context { + let diff = builder.sub_extension(nv.context, lv.context); + let constr = builder.mul_extension(op, diff); + yield_constr.constraint_transition(builder, constr); + } + } + + // context_op is hybrid; we evaluate it separately. + let is_get_context = + builder.mul_sub_extension(lv.op.context_op, lv.opcode_bits[0], lv.op.context_op); + let diff = builder.sub_extension(nv.context, lv.context); + let constr = builder.mul_extension(is_get_context, diff); + yield_constr.constraint_transition(builder, constr); +} + +/// Evaluates constraints for GET_CONTEXT. fn eval_packed_get( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { - let filter = lv.op.get_context; + // If the opcode is GET_CONTEXT, then lv.opcode_bits[0] = 0. + let filter = lv.op.context_op * (P::ONES - lv.opcode_bits[0]); let new_stack_top = nv.mem_channels[0].value; - yield_constr.constraint(filter * (new_stack_top[0] - lv.context)); - for &limb in &new_stack_top[1..] { + // Context is scaled by 2^64, hence stored in the 3rd limb. + yield_constr.constraint(filter * (new_stack_top[2] - lv.context)); + + for (_, &limb) in new_stack_top.iter().enumerate().filter(|(i, _)| *i != 2) { yield_constr.constraint(filter * limb); } + + // Constrain new stack length. + yield_constr.constraint(filter * (nv.stack_len - (lv.stack_len + P::ONES))); + + // Unused channels. + disable_unused_channels(lv, filter, vec![1], yield_constr); + yield_constr.constraint(filter * nv.mem_channels[0].used); } +/// Circuit version of `eval_packed_get`. +/// Evaluates constraints for GET_CONTEXT. fn eval_ext_circuit_get, const D: usize>( builder: &mut CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { - let filter = lv.op.get_context; + // If the opcode is GET_CONTEXT, then lv.opcode_bits[0] = 0. + let prod = builder.mul_extension(lv.op.context_op, lv.opcode_bits[0]); + let filter = builder.sub_extension(lv.op.context_op, prod); let new_stack_top = nv.mem_channels[0].value; + // Context is scaled by 2^64, hence stored in the 3rd limb. { - let diff = builder.sub_extension(new_stack_top[0], lv.context); + let diff = builder.sub_extension(new_stack_top[2], lv.context); let constr = builder.mul_extension(filter, diff); yield_constr.constraint(builder, constr); } - for &limb in &new_stack_top[1..] { + + for (_, &limb) in new_stack_top.iter().enumerate().filter(|(i, _)| *i != 2) { let constr = builder.mul_extension(filter, limb); yield_constr.constraint(builder, constr); } + + // Constrain new stack length. 
+ { + let new_len = builder.add_const_extension(lv.stack_len, F::ONE); + let diff = builder.sub_extension(nv.stack_len, new_len); + let constr = builder.mul_extension(filter, diff); + yield_constr.constraint(builder, constr); + } + + // Unused channels. + disable_unused_channels_circuit(builder, lv, filter, vec![1], yield_constr); + { + let constr = builder.mul_extension(filter, nv.mem_channels[0].used); + yield_constr.constraint(builder, constr); + } } +/// Evaluates constraints for `SET_CONTEXT`. fn eval_packed_set( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { - let filter = lv.op.set_context; + let filter = lv.op.context_op * lv.opcode_bits[0]; let stack_top = lv.mem_channels[0].value; - let write_old_sp_channel = lv.mem_channels[1]; - let read_new_sp_channel = lv.mem_channels[2]; - let ctx_metadata_segment = P::Scalar::from_canonical_u64(Segment::ContextMetadata as u64); - let stack_size_field = P::Scalar::from_canonical_u64(ContextMetadata::StackSize as u64); - let local_sp_dec = lv.stack_len - P::ONES; // The next row's context is read from stack_top. - yield_constr.constraint(filter * (stack_top[0] - nv.context)); - - // The old SP is decremented (since the new context was popped) and written to memory. - yield_constr.constraint(filter * (write_old_sp_channel.value[0] - local_sp_dec)); - for limb in &write_old_sp_channel.value[1..] { - yield_constr.constraint(filter * *limb); + yield_constr.constraint(filter * (stack_top[2] - nv.context)); + for (_, &limb) in stack_top.iter().enumerate().filter(|(i, _)| *i != 2) { + yield_constr.constraint(filter * limb); } - yield_constr.constraint(filter * (write_old_sp_channel.used - P::ONES)); - yield_constr.constraint(filter * write_old_sp_channel.is_read); - yield_constr.constraint(filter * (write_old_sp_channel.addr_context - lv.context)); - yield_constr.constraint(filter * (write_old_sp_channel.addr_segment - ctx_metadata_segment)); - yield_constr.constraint(filter * (write_old_sp_channel.addr_virtual - stack_size_field)); + // The old SP is decremented (since the new context was popped) and stored in memory. // The new SP is loaded from memory. 
- yield_constr.constraint(filter * (read_new_sp_channel.value[0] - nv.stack_len)); - yield_constr.constraint(filter * (read_new_sp_channel.used - P::ONES)); - yield_constr.constraint(filter * (read_new_sp_channel.is_read - P::ONES)); - yield_constr.constraint(filter * (read_new_sp_channel.addr_context - nv.context)); - yield_constr.constraint(filter * (read_new_sp_channel.addr_segment - ctx_metadata_segment)); - yield_constr.constraint(filter * (read_new_sp_channel.addr_virtual - stack_size_field)); + // This is all done with CTLs: nothing is constrained here. - // The next row's stack top is loaded from memory (if the stack isn't empty). - yield_constr.constraint(filter * nv.mem_channels[0].used); - - let read_new_stack_top_channel = lv.mem_channels[3]; - let stack_segment = P::Scalar::from_canonical_u64(Segment::Stack as u64); - let new_filter = filter * nv.stack_len; - - for (limb_channel, limb_top) in read_new_stack_top_channel + // Constrain stack_inv_aux_2. + let new_top_channel = nv.mem_channels[0]; + yield_constr.constraint( + lv.op.context_op + * (lv.general.stack().stack_inv_aux * lv.opcode_bits[0] + - lv.general.stack().stack_inv_aux_2), + ); + // The new top is loaded in memory channel 2, if the stack isn't empty (see eval_packed). 
+ for (&limb_new_top, &limb_read_top) in new_top_channel .value .iter() - .zip(nv.mem_channels[0].value) + .zip(lv.mem_channels[2].value.iter()) { - yield_constr.constraint(new_filter * (*limb_channel - limb_top)); + yield_constr.constraint( + lv.op.context_op * lv.general.stack().stack_inv_aux_2 * (limb_new_top - limb_read_top), + ); } - yield_constr.constraint(new_filter * (read_new_stack_top_channel.used - P::ONES)); - yield_constr.constraint(new_filter * (read_new_stack_top_channel.is_read - P::ONES)); - yield_constr.constraint(new_filter * (read_new_stack_top_channel.addr_context - nv.context)); - yield_constr.constraint(new_filter * (read_new_stack_top_channel.addr_segment - stack_segment)); - yield_constr.constraint( - new_filter * (read_new_stack_top_channel.addr_virtual - (nv.stack_len - P::ONES)), - ); - // If the new stack is empty, disable the channel read. - yield_constr.constraint( - filter * (nv.stack_len * lv.general.stack().stack_inv - lv.general.stack().stack_inv_aux), - ); - let empty_stack_filter = filter * (lv.general.stack().stack_inv_aux - P::ONES); - yield_constr.constraint(empty_stack_filter * read_new_stack_top_channel.used); + // Unused channels. + disable_unused_channels(lv, filter, vec![1], yield_constr); + yield_constr.constraint(filter * new_top_channel.used); } +/// Circuit version of `eval_packed_set`. +/// Evaluates constraints for SET_CONTEXT. 
fn eval_ext_circuit_set, const D: usize>( builder: &mut CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { - let filter = lv.op.set_context; + let filter = builder.mul_extension(lv.op.context_op, lv.opcode_bits[0]); let stack_top = lv.mem_channels[0].value; - let write_old_sp_channel = lv.mem_channels[1]; - let read_new_sp_channel = lv.mem_channels[2]; - let ctx_metadata_segment = builder.constant_extension(F::Extension::from_canonical_u32( - Segment::ContextMetadata as u32, - )); - let stack_size_field = builder.constant_extension(F::Extension::from_canonical_u32( - ContextMetadata::StackSize as u32, - )); - let one = builder.one_extension(); - let local_sp_dec = builder.sub_extension(lv.stack_len, one); // The next row's context is read from stack_top. { - let diff = builder.sub_extension(stack_top[0], nv.context); + let diff = builder.sub_extension(stack_top[2], nv.context); let constr = builder.mul_extension(filter, diff); yield_constr.constraint(builder, constr); } - - // The old SP is decremented (since the new context was popped) and written to memory. - { - let diff = builder.sub_extension(write_old_sp_channel.value[0], local_sp_dec); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint(builder, constr); - } - for limb in &write_old_sp_channel.value[1..] 
{ - let constr = builder.mul_extension(filter, *limb); - yield_constr.constraint(builder, constr); - } - { - let constr = builder.mul_sub_extension(filter, write_old_sp_channel.used, filter); - yield_constr.constraint(builder, constr); - } - { - let constr = builder.mul_extension(filter, write_old_sp_channel.is_read); - yield_constr.constraint(builder, constr); - } - { - let diff = builder.sub_extension(write_old_sp_channel.addr_context, lv.context); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint(builder, constr); - } - { - let diff = builder.sub_extension(write_old_sp_channel.addr_segment, ctx_metadata_segment); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint(builder, constr); - } - { - let diff = builder.sub_extension(write_old_sp_channel.addr_virtual, stack_size_field); - let constr = builder.mul_extension(filter, diff); + for (_, &limb) in stack_top.iter().enumerate().filter(|(i, _)| *i != 2) { + let constr = builder.mul_extension(filter, limb); yield_constr.constraint(builder, constr); } + // The old SP is decremented (since the new context was popped) and stored in memory. // The new SP is loaded from memory. + // This is all done with CTLs: nothing is constrained here. + + // Constrain stack_inv_aux_2. 
+ let new_top_channel = nv.mem_channels[0]; { - let diff = builder.sub_extension(read_new_sp_channel.value[0], nv.stack_len); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint(builder, constr); - } - { - let constr = builder.mul_sub_extension(filter, read_new_sp_channel.used, filter); - yield_constr.constraint(builder, constr); - } - { - let constr = builder.mul_sub_extension(filter, read_new_sp_channel.is_read, filter); - yield_constr.constraint(builder, constr); - } - { - let diff = builder.sub_extension(read_new_sp_channel.addr_context, nv.context); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint(builder, constr); - } - { - let diff = builder.sub_extension(read_new_sp_channel.addr_segment, ctx_metadata_segment); - let constr = builder.mul_extension(filter, diff); + let diff = builder.mul_sub_extension( + lv.general.stack().stack_inv_aux, + lv.opcode_bits[0], + lv.general.stack().stack_inv_aux_2, + ); + let constr = builder.mul_extension(lv.op.context_op, diff); yield_constr.constraint(builder, constr); } + // The new top is loaded in memory channel 2, if the stack isn't empty (see eval_packed). + for (&limb_new_top, &limb_read_top) in new_top_channel + .value + .iter() + .zip(lv.mem_channels[2].value.iter()) { - let diff = builder.sub_extension(read_new_sp_channel.addr_virtual, stack_size_field); - let constr = builder.mul_extension(filter, diff); + let diff = builder.sub_extension(limb_new_top, limb_read_top); + let prod = builder.mul_extension(lv.general.stack().stack_inv_aux_2, diff); + let constr = builder.mul_extension(lv.op.context_op, prod); yield_constr.constraint(builder, constr); } - // The next row's stack top is loaded from memory (if the stack isn't empty). + // Unused channels. 
+ disable_unused_channels_circuit(builder, lv, filter, vec![1], yield_constr); { - let constr = builder.mul_extension(filter, nv.mem_channels[0].used); + let constr = builder.mul_extension(filter, new_top_channel.used); yield_constr.constraint(builder, constr); } +} - let read_new_stack_top_channel = lv.mem_channels[3]; - let stack_segment = - builder.constant_extension(F::Extension::from_canonical_u32(Segment::Stack as u32)); +/// Evaluates the constraints for the GET and SET opcodes. +pub(crate) fn eval_packed( + lv: &CpuColumnsView

, + nv: &CpuColumnsView

, + yield_constr: &mut ConstraintConsumer

, +) { + eval_packed_keep(lv, nv, yield_constr); + eval_packed_get(lv, nv, yield_constr); + eval_packed_set(lv, nv, yield_constr); - let new_filter = builder.mul_extension(filter, nv.stack_len); + // Stack constraints. + // Both operations use memory channel 2. The operations are similar enough that + // we can constrain both at the same time. + let filter = lv.op.context_op; + let channel = lv.mem_channels[2]; + // For get_context, we check if lv.stack_len is 0. For set_context, we check if nv.stack_len is 0. + // However, for get_context, we can deduce lv.stack_len from nv.stack_len since the operation only pushes. + let stack_len = nv.stack_len - (P::ONES - lv.opcode_bits[0]); + // Constrain stack_inv_aux. It's 0 if the relevant stack is empty, 1 otherwise. + yield_constr.constraint( + filter * (stack_len * lv.general.stack().stack_inv - lv.general.stack().stack_inv_aux), + ); + // Enable or disable the channel. + yield_constr.constraint(filter * (lv.general.stack().stack_inv_aux - channel.used)); + let new_filter = filter * lv.general.stack().stack_inv_aux; + // It's a write for get_context, a read for set_context. + yield_constr.constraint(new_filter * (channel.is_read - lv.opcode_bits[0])); + // In both cases, next row's context works. + yield_constr.constraint(new_filter * (channel.addr_context - nv.context)); + // Same segment for both. + yield_constr.constraint( + new_filter + * (channel.addr_segment - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), + ); + // The address is one less than stack_len. + let addr_virtual = stack_len - P::ONES; + yield_constr.constraint(new_filter * (channel.addr_virtual - addr_virtual)); +} - for (limb_channel, limb_top) in read_new_stack_top_channel - .value - .iter() - .zip(nv.mem_channels[0].value) - { - let diff = builder.sub_extension(*limb_channel, limb_top); - let constr = builder.mul_extension(new_filter, diff); - yield_constr.constraint(builder, constr); - } +/// Circuit version of èval_packed`. 
+/// Evaluates the constraints for the GET and SET opcodes. +pub(crate) fn eval_ext_circuit, const D: usize>( + builder: &mut CircuitBuilder, + lv: &CpuColumnsView>, + nv: &CpuColumnsView>, + yield_constr: &mut RecursiveConstraintConsumer, +) { + eval_ext_circuit_keep(builder, lv, nv, yield_constr); + eval_ext_circuit_get(builder, lv, nv, yield_constr); + eval_ext_circuit_set(builder, lv, nv, yield_constr); + + // Stack constraints. + // Both operations use memory channel 2. The operations are similar enough that + // we can constrain both at the same time. + let filter = lv.op.context_op; + let channel = lv.mem_channels[2]; + // For get_context, we check if lv.stack_len is 0. For set_context, we check if nv.stack_len is 0. + // However, for get_context, we can deduce lv.stack_len from nv.stack_len since the operation only pushes. + let diff = builder.add_const_extension(lv.opcode_bits[0], -F::ONE); + let stack_len = builder.add_extension(nv.stack_len, diff); + // Constrain stack_inv_aux. It's 0 if the relevant stack is empty, 1 otherwise. { - let constr = - builder.mul_sub_extension(new_filter, read_new_stack_top_channel.used, new_filter); + let diff = builder.mul_sub_extension( + stack_len, + lv.general.stack().stack_inv, + lv.general.stack().stack_inv_aux, + ); + let constr = builder.mul_extension(filter, diff); yield_constr.constraint(builder, constr); } + // Enable or disable the channel. { - let constr = - builder.mul_sub_extension(new_filter, read_new_stack_top_channel.is_read, new_filter); + let diff = builder.sub_extension(lv.general.stack().stack_inv_aux, channel.used); + let constr = builder.mul_extension(filter, diff); yield_constr.constraint(builder, constr); } + let new_filter = builder.mul_extension(filter, lv.general.stack().stack_inv_aux); + // It's a write for get_context, a read for set_context. 
{ - let diff = builder.sub_extension(read_new_stack_top_channel.addr_context, nv.context); + let diff = builder.sub_extension(channel.is_read, lv.opcode_bits[0]); let constr = builder.mul_extension(new_filter, diff); yield_constr.constraint(builder, constr); } + // In both cases, next row's context works. { - let diff = builder.sub_extension(read_new_stack_top_channel.addr_segment, stack_segment); + let diff = builder.sub_extension(channel.addr_context, nv.context); let constr = builder.mul_extension(new_filter, diff); yield_constr.constraint(builder, constr); } + // Same segment for both. { - let diff = builder.sub_extension(nv.stack_len, one); - let diff = builder.sub_extension(read_new_stack_top_channel.addr_virtual, diff); + let diff = builder.add_const_extension( + channel.addr_segment, + -F::from_canonical_usize(Segment::Stack.unscale()), + ); let constr = builder.mul_extension(new_filter, diff); yield_constr.constraint(builder, constr); } - - // If the new stack is empty, disable the channel read. + // The address is one less than stack_len. { - let diff = builder.mul_extension(nv.stack_len, lv.general.stack().stack_inv); - let diff = builder.sub_extension(diff, lv.general.stack().stack_inv_aux); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint(builder, constr); - } - - { - let empty_stack_filter = - builder.mul_sub_extension(filter, lv.general.stack().stack_inv_aux, filter); - let constr = builder.mul_extension(empty_stack_filter, read_new_stack_top_channel.used); + let addr_virtual = builder.add_const_extension(stack_len, -F::ONE); + let diff = builder.sub_extension(channel.addr_virtual, addr_virtual); + let constr = builder.mul_extension(new_filter, diff); yield_constr.constraint(builder, constr); } } - -pub fn eval_packed( - lv: &CpuColumnsView

, - nv: &CpuColumnsView

, - yield_constr: &mut ConstraintConsumer

, -) { - eval_packed_get(lv, nv, yield_constr); - eval_packed_set(lv, nv, yield_constr); -} - -pub fn eval_ext_circuit, const D: usize>( - builder: &mut CircuitBuilder, - lv: &CpuColumnsView>, - nv: &CpuColumnsView>, - yield_constr: &mut RecursiveConstraintConsumer, -) { - eval_ext_circuit_get(builder, lv, nv, yield_constr); - eval_ext_circuit_set(builder, lv, nv, yield_constr); -} diff --git a/evm/src/cpu/control_flow.rs b/evm/src/cpu/control_flow.rs index 2f496b514a..bde5930572 100644 --- a/evm/src/cpu/control_flow.rs +++ b/evm/src/cpu/control_flow.rs @@ -8,43 +8,42 @@ use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer use crate::cpu::columns::{CpuColumnsView, COL_MAP}; use crate::cpu::kernel::aggregator::KERNEL; -const NATIVE_INSTRUCTIONS: [usize; 17] = [ +const NATIVE_INSTRUCTIONS: [usize; 12] = [ COL_MAP.op.binary_op, COL_MAP.op.ternary_op, COL_MAP.op.fp254_op, COL_MAP.op.eq_iszero, COL_MAP.op.logic_op, - COL_MAP.op.not, + COL_MAP.op.not_pop, COL_MAP.op.shift, - COL_MAP.op.keccak_general, - COL_MAP.op.prover_input, - COL_MAP.op.pop, + COL_MAP.op.jumpdest_keccak_general, + // Not PROVER_INPUT: it is dealt with manually below. // not JUMPS (possible need to jump) - COL_MAP.op.pc, - COL_MAP.op.jumpdest, - COL_MAP.op.push0, + COL_MAP.op.pc_push0, // not PUSH (need to increment by more than 1) COL_MAP.op.dup_swap, - COL_MAP.op.get_context, - COL_MAP.op.set_context, + COL_MAP.op.context_op, // not EXIT_KERNEL (performs a jump) COL_MAP.op.m_op_general, // not SYSCALL (performs a jump) // not exceptions (also jump) ]; +/// Returns `halt`'s program counter. pub(crate) fn get_halt_pc() -> F { let halt_pc = KERNEL.global_labels["halt"]; F::from_canonical_usize(halt_pc) } +/// Returns `main`'s program counter. pub(crate) fn get_start_pc() -> F { let start_pc = KERNEL.global_labels["main"]; F::from_canonical_usize(start_pc) } -pub fn eval_packed_generic( +/// Evaluates the constraints related to the flow of instructions. 
+pub(crate) fn eval_packed_generic( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -52,7 +51,7 @@ pub fn eval_packed_generic( let is_cpu_cycle: P = COL_MAP.op.iter().map(|&col_i| lv[col_i]).sum(); let is_cpu_cycle_next: P = COL_MAP.op.iter().map(|&col_i| nv[col_i]).sum(); - let next_halt_state = P::ONES - nv.is_bootstrap_kernel - is_cpu_cycle_next; + let next_halt_state = P::ONES - is_cpu_cycle_next; // Once we start executing instructions, then we continue until the end of the table // or we reach dummy padding rows. This, along with the constraints on the first row, @@ -71,6 +70,13 @@ pub fn eval_packed_generic( yield_constr .constraint_transition(is_native_instruction * (lv.is_kernel_mode - nv.is_kernel_mode)); + // Apply the same checks as before, for PROVER_INPUT. + let is_prover_input: P = lv.op.push_prover_input * (lv.opcode_bits[5] - P::ONES); + yield_constr.constraint_transition( + is_prover_input * (lv.program_counter - nv.program_counter + P::ONES), + ); + yield_constr.constraint_transition(is_prover_input * (lv.is_kernel_mode - nv.is_kernel_mode)); + // If a non-CPU cycle row is followed by a CPU cycle row, then: // - the `program_counter` of the CPU cycle row is `main` (the entry point of our kernel), // - execution is in kernel mode, and @@ -82,7 +88,9 @@ pub fn eval_packed_generic( yield_constr.constraint_transition(is_last_noncpu_cycle * nv.stack_len); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates the constraints related to the flow of instructions. 
+pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, @@ -93,8 +101,7 @@ pub fn eval_ext_circuit, const D: usize>( let is_cpu_cycle = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| lv[col_i])); let is_cpu_cycle_next = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| nv[col_i])); - let next_halt_state = builder.add_extension(nv.is_bootstrap_kernel, is_cpu_cycle_next); - let next_halt_state = builder.sub_extension(one, next_halt_state); + let next_halt_state = builder.sub_extension(one, is_cpu_cycle_next); // Once we start executing instructions, then we continue until the end of the table // or we reach dummy padding rows. This, along with the constraints on the first row, @@ -117,6 +124,17 @@ pub fn eval_ext_circuit, const D: usize>( let kernel_diff = builder.sub_extension(lv.is_kernel_mode, nv.is_kernel_mode); let kernel_constr = builder.mul_extension(filter, kernel_diff); yield_constr.constraint_transition(builder, kernel_constr); + + // Same constraints as before, for PROVER_INPUT. 
+ let is_prover_input = builder.mul_sub_extension( + lv.op.push_prover_input, + lv.opcode_bits[5], + lv.op.push_prover_input, + ); + let pc_constr = builder.mul_add_extension(is_prover_input, pc_diff, is_prover_input); + yield_constr.constraint_transition(builder, pc_constr); + let kernel_constr = builder.mul_extension(is_prover_input, kernel_diff); + yield_constr.constraint_transition(builder, kernel_constr); } // If a non-CPU cycle row is followed by a CPU cycle row, then: diff --git a/evm/src/cpu/cpu_stark.rs b/evm/src/cpu/cpu_stark.rs index 64a2db9c36..8bcada2f3b 100644 --- a/evm/src/cpu/cpu_stark.rs +++ b/evm/src/cpu/cpu_stark.rs @@ -1,6 +1,6 @@ -use std::borrow::Borrow; -use std::iter::repeat; -use std::marker::PhantomData; +use core::borrow::Borrow; +use core::iter::repeat; +use core::marker::PhantomData; use itertools::Itertools; use plonky2::field::extension::{Extendable, FieldExtension}; @@ -11,81 +11,91 @@ use plonky2::iop::ext_target::ExtensionTarget; use super::columns::CpuColumnsView; use super::halt; +use super::kernel::constants::context_metadata::ContextMetadata; +use super::membus::NUM_GP_CHANNELS; use crate::all_stark::Table; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::columns::{COL_MAP, NUM_CPU_COLUMNS}; -use crate::cpu::membus::NUM_GP_CHANNELS; use crate::cpu::{ - bootstrap_kernel, contextops, control_flow, decode, dup_swap, gas, jumps, membus, memio, - modfp254, pc, push0, shift, simple_logic, stack, stack_bounds, syscalls_exceptions, + byte_unpacking, clock, contextops, control_flow, decode, dup_swap, gas, jumps, membus, memio, + modfp254, pc, push0, shift, simple_logic, stack, syscalls_exceptions, }; -use crate::cross_table_lookup::{Column, TableWithColumns}; +use crate::cross_table_lookup::TableWithColumns; use crate::evaluation_frame::{StarkEvaluationFrame, StarkFrame}; +use crate::lookup::{Column, Filter}; use crate::memory::segments::Segment; use crate::memory::{NUM_CHANNELS, 
VALUE_LIMBS}; use crate::stark::Stark; -pub fn ctl_data_keccak_sponge() -> Vec> { +/// Creates the vector of `Columns` corresponding to the General Purpose channels when calling the Keccak sponge: +/// the CPU reads the output of the sponge directly from the `KeccakSpongeStark` table. +pub(crate) fn ctl_data_keccak_sponge() -> Vec> { // When executing KECCAK_GENERAL, the GP memory channels are used as follows: - // GP channel 0: stack[-1] = context - // GP channel 1: stack[-2] = segment - // GP channel 2: stack[-3] = virt - // GP channel 3: stack[-4] = len - // GP channel 4: pushed = outputs - let context = Column::single(COL_MAP.mem_channels[0].value[0]); - let segment = Column::single(COL_MAP.mem_channels[1].value[0]); - let virt = Column::single(COL_MAP.mem_channels[2].value[0]); - let len = Column::single(COL_MAP.mem_channels[3].value[0]); + // GP channel 0: stack[-1] = addr (context, segment, virt) + // GP channel 1: stack[-2] = len + // Next GP channel 0: pushed = outputs + let (context, segment, virt) = get_addr(&COL_MAP, 0); + let context = Column::single(context); + let segment = Column::single(segment); + let virt = Column::single(virt); + let len = Column::single(COL_MAP.mem_channels[1].value[0]); let num_channels = F::from_canonical_usize(NUM_CHANNELS); let timestamp = Column::linear_combination([(COL_MAP.clock, num_channels)]); let mut cols = vec![context, segment, virt, len, timestamp]; - cols.extend(COL_MAP.mem_channels[4].value.map(Column::single)); + cols.extend(Column::singles_next_row(COL_MAP.mem_channels[0].value)); cols } -pub fn ctl_filter_keccak_sponge() -> Column { - Column::single(COL_MAP.is_keccak_sponge) +/// CTL filter for a call to the Keccak sponge. +// KECCAK_GENERAL is differentiated from JUMPDEST by its second bit set to 0. 
+pub(crate) fn ctl_filter_keccak_sponge() -> Filter { + Filter::new( + vec![( + Column::single(COL_MAP.op.jumpdest_keccak_general), + Column::linear_combination_with_constant([(COL_MAP.opcode_bits[1], -F::ONE)], F::ONE), + )], + vec![], + ) } -/// Create the vector of Columns corresponding to the two inputs and +/// Creates the vector of `Columns` corresponding to the two inputs and /// one output of a binary operation. fn ctl_data_binops() -> Vec> { let mut res = Column::singles(COL_MAP.mem_channels[0].value).collect_vec(); res.extend(Column::singles(COL_MAP.mem_channels[1].value)); - res.extend(Column::singles( - COL_MAP.mem_channels[NUM_GP_CHANNELS - 1].value, - )); + res.extend(Column::singles_next_row(COL_MAP.mem_channels[0].value)); res } -/// Create the vector of Columns corresponding to the three inputs and +/// Creates the vector of `Columns` corresponding to the three inputs and /// one output of a ternary operation. By default, ternary operations use -/// the first three memory channels, and the last one for the result (binary -/// operations do not use the third inputs). +/// the first three memory channels, and the next top of the stack for the +/// result (binary operations do not use the third inputs). fn ctl_data_ternops() -> Vec> { let mut res = Column::singles(COL_MAP.mem_channels[0].value).collect_vec(); res.extend(Column::singles(COL_MAP.mem_channels[1].value)); res.extend(Column::singles(COL_MAP.mem_channels[2].value)); - res.extend(Column::singles( - COL_MAP.mem_channels[NUM_GP_CHANNELS - 1].value, - )); + res.extend(Column::singles_next_row(COL_MAP.mem_channels[0].value)); res } -pub fn ctl_data_logic() -> Vec> { +/// Creates the vector of columns corresponding to the opcode, the two inputs and the output of the logic operation. +pub(crate) fn ctl_data_logic() -> Vec> { // Instead of taking single columns, we reconstruct the entire opcode value directly. 
let mut res = vec![Column::le_bits(COL_MAP.opcode_bits)]; res.extend(ctl_data_binops()); res } -pub fn ctl_filter_logic() -> Column { - Column::single(COL_MAP.op.logic_op) +/// CTL filter for logic operations. +pub(crate) fn ctl_filter_logic() -> Filter { + Filter::new_simple(Column::single(COL_MAP.op.logic_op)) } -pub fn ctl_arithmetic_base_rows() -> TableWithColumns { +/// Returns the `TableWithColumns` for the CPU rows calling arithmetic operations. +pub(crate) fn ctl_arithmetic_base_rows() -> TableWithColumns { // Instead of taking single columns, we reconstruct the entire opcode value directly. let mut columns = vec![Column::le_bits(COL_MAP.opcode_bits)]; columns.extend(ctl_data_ternops()); @@ -94,54 +104,177 @@ pub fn ctl_arithmetic_base_rows() -> TableWithColumns { // (also `ops` is used as the operation filter). The list of // operations includes binary operations which will simply ignore // the third input. + let col_bit = Column::linear_combination_with_constant( + vec![(COL_MAP.opcode_bits[5], F::NEG_ONE)], + F::ONE, + ); TableWithColumns::new( - Table::Cpu, + *Table::Cpu, columns, - Some(Column::sum([ - COL_MAP.op.binary_op, - COL_MAP.op.fp254_op, - COL_MAP.op.ternary_op, - COL_MAP.op.shift, - ])), + Some(Filter::new( + vec![(Column::single(COL_MAP.op.push_prover_input), col_bit)], + vec![Column::sum([ + COL_MAP.op.binary_op, + COL_MAP.op.fp254_op, + COL_MAP.op.ternary_op, + COL_MAP.op.shift, + COL_MAP.op.syscall, + COL_MAP.op.exception, + ])], + )), ) } -pub fn ctl_data_byte_packing() -> Vec> { - ctl_data_keccak_sponge() +/// Creates the vector of `Columns` corresponding to the contents of General Purpose channels when calling byte packing. +/// We use `ctl_data_keccak_sponge` because the `Columns` are the same as the ones computed for `KeccakSpongeStark`. 
+pub(crate) fn ctl_data_byte_packing() -> Vec> { + let mut res = vec![Column::constant(F::ONE)]; // is_read + res.extend(ctl_data_keccak_sponge()); + res } -pub fn ctl_filter_byte_packing() -> Column { - Column::single(COL_MAP.op.mload_32bytes) +/// CTL filter for the `MLOAD_32BYTES` operation. +/// MLOAD_32 BYTES is differentiated from MSTORE_32BYTES by its fifth bit set to 1. +pub(crate) fn ctl_filter_byte_packing() -> Filter { + Filter::new( + vec![( + Column::single(COL_MAP.op.m_op_32bytes), + Column::single(COL_MAP.opcode_bits[5]), + )], + vec![], + ) } -pub fn ctl_data_byte_unpacking() -> Vec> { +/// Creates the vector of `Columns` corresponding to the contents of General Purpose channels when calling byte unpacking. +pub(crate) fn ctl_data_byte_unpacking() -> Vec> { + let is_read = Column::constant(F::ZERO); + // When executing MSTORE_32BYTES, the GP memory channels are used as follows: - // GP channel 0: stack[-1] = context - // GP channel 1: stack[-2] = segment - // GP channel 2: stack[-3] = virt - // GP channel 3: stack[-4] = val - // GP channel 4: stack[-5] = len - let context = Column::single(COL_MAP.mem_channels[0].value[0]); - let segment = Column::single(COL_MAP.mem_channels[1].value[0]); - let virt = Column::single(COL_MAP.mem_channels[2].value[0]); - let val = Column::singles(COL_MAP.mem_channels[3].value); - let len = Column::single(COL_MAP.mem_channels[4].value[0]); + // GP channel 0: stack[-1] = addr (context, segment, virt) + // GP channel 1: stack[-2] = val + // Next GP channel 0: pushed = new_offset (virt + len) + let (context, segment, virt) = get_addr(&COL_MAP, 0); + let mut res = vec![ + is_read, + Column::single(context), + Column::single(segment), + Column::single(virt), + ]; + + // len can be reconstructed as new_offset - virt. 
+ let len = Column::linear_combination_and_next_row_with_constant( + [(COL_MAP.mem_channels[0].value[0], -F::ONE)], + [(COL_MAP.mem_channels[0].value[0], F::ONE)], + F::ZERO, + ); + res.push(len); + + let num_channels = F::from_canonical_usize(NUM_CHANNELS); + let timestamp = Column::linear_combination([(COL_MAP.clock, num_channels)]); + res.push(timestamp); + + let val = Column::singles(COL_MAP.mem_channels[1].value); + res.extend(val); + + res +} + +/// CTL filter for the `MSTORE_32BYTES` operation. +/// MSTORE_32BYTES is differentiated from MLOAD_32BYTES by its fifth bit set to 0. +pub(crate) fn ctl_filter_byte_unpacking() -> Filter { + Filter::new( + vec![( + Column::single(COL_MAP.op.m_op_32bytes), + Column::linear_combination_with_constant([(COL_MAP.opcode_bits[5], -F::ONE)], F::ONE), + )], + vec![], + ) +} + +/// Creates the vector of `Columns` corresponding to three consecutive (byte) reads in memory. +/// It's used by syscalls and exceptions to read an address in a jumptable. +pub(crate) fn ctl_data_jumptable_read() -> Vec> { + let is_read = Column::constant(F::ONE); + let mut res = vec![is_read]; + + // When reading the jumptable, the address to start reading from is in + // GP channel 1; the result is in GP channel 1's values. + let channel_map = COL_MAP.mem_channels[1]; + res.extend(Column::singles([ + channel_map.addr_context, + channel_map.addr_segment, + channel_map.addr_virtual, + ])); + let val = Column::singles(channel_map.value); + + // len is always 3. + let len = Column::constant(F::from_canonical_usize(3)); + res.push(len); + + let num_channels = F::from_canonical_usize(NUM_CHANNELS); + let timestamp = Column::linear_combination([(COL_MAP.clock, num_channels)]); + res.push(timestamp); + + res.extend(val); + + res +} + +/// CTL filter for syscalls and exceptions. 
+pub(crate) fn ctl_filter_syscall_exceptions() -> Filter { + Filter::new_simple(Column::sum([COL_MAP.op.syscall, COL_MAP.op.exception])) +} + +/// Creates the vector of `Columns` corresponding to the contents of the CPU registers when performing a `PUSH`. +/// `PUSH` internal reads are done by calling `BytePackingStark`. +pub(crate) fn ctl_data_byte_packing_push() -> Vec> { + let is_read = Column::constant(F::ONE); + let context = Column::single(COL_MAP.code_context); + let segment = Column::constant(F::from_canonical_usize(Segment::Code as usize)); + // The initial offset if `pc + 1`. + let virt = + Column::linear_combination_with_constant([(COL_MAP.program_counter, F::ONE)], F::ONE); + let val = Column::singles_next_row(COL_MAP.mem_channels[0].value); + + // We fetch the length from the `PUSH` opcode lower bits, that indicate `len - 1`. + let len = Column::le_bits_with_constant(&COL_MAP.opcode_bits[0..5], F::ONE); let num_channels = F::from_canonical_usize(NUM_CHANNELS); let timestamp = Column::linear_combination([(COL_MAP.clock, num_channels)]); - let mut res = vec![context, segment, virt, len, timestamp]; + let mut res = vec![is_read, context, segment, virt, len, timestamp]; res.extend(val); res } -pub fn ctl_filter_byte_unpacking() -> Column { - Column::single(COL_MAP.op.mstore_32bytes) +/// CTL filter for the `PUSH` operation. +pub(crate) fn ctl_filter_byte_packing_push() -> Filter { + let bit_col = Column::single(COL_MAP.opcode_bits[5]); + Filter::new( + vec![(Column::single(COL_MAP.op.push_prover_input), bit_col)], + vec![], + ) } -pub const MEM_CODE_CHANNEL_IDX: usize = 0; -pub const MEM_GP_CHANNELS_IDX_START: usize = MEM_CODE_CHANNEL_IDX + 1; +/// Index of the memory channel storing code. +pub(crate) const MEM_CODE_CHANNEL_IDX: usize = 0; +/// Index of the first general purpose memory channel. 
+pub(crate) const MEM_GP_CHANNELS_IDX_START: usize = MEM_CODE_CHANNEL_IDX + 1; + +/// Recover the three components of an address, given a CPU row and +/// a provided memory channel index. +/// The components are recovered as follows: +/// +/// - `context`, shifted by 2^64 (i.e. at index 2) +/// - `segment`, shifted by 2^32 (i.e. at index 1) +/// - `virtual`, not shifted (i.e. at index 0) +pub(crate) const fn get_addr(lv: &CpuColumnsView, mem_channel: usize) -> (T, T, T) { + let addr_context = lv.mem_channels[mem_channel].value[2]; + let addr_segment = lv.mem_channels[mem_channel].value[1]; + let addr_virtual = lv.mem_channels[mem_channel].value[0]; + (addr_context, addr_segment, addr_virtual) +} /// Make the time/channel column for memory lookups. fn mem_time_and_channel(channel: usize) -> Column { @@ -150,12 +283,13 @@ fn mem_time_and_channel(channel: usize) -> Column { Column::linear_combination_with_constant([(COL_MAP.clock, scalar)], addend) } -pub fn ctl_data_code_memory() -> Vec> { +/// Creates the vector of `Columns` corresponding to the contents of the code channel when reading code values. +pub(crate) fn ctl_data_code_memory() -> Vec> { let mut cols = vec![ - Column::constant(F::ONE), // is_read - Column::single(COL_MAP.code_context), // addr_context - Column::constant(F::from_canonical_u64(Segment::Code as u64)), // addr_segment - Column::single(COL_MAP.program_counter), // addr_virtual + Column::constant(F::ONE), // is_read + Column::single(COL_MAP.code_context), // addr_context + Column::constant(F::from_canonical_usize(Segment::Code.unscale())), // addr_segment + Column::single(COL_MAP.program_counter), // addr_virtual ]; // Low limb of the value matches the opcode bits @@ -169,7 +303,8 @@ pub fn ctl_data_code_memory() -> Vec> { cols } -pub fn ctl_data_gp_memory(channel: usize) -> Vec> { +/// Creates the vector of `Columns` corresponding to the contents of General Purpose channels. 
+pub(crate) fn ctl_data_gp_memory(channel: usize) -> Vec> { let channel_map = COL_MAP.mem_channels[channel]; let mut cols: Vec<_> = Column::singles([ channel_map.is_read, @@ -186,16 +321,133 @@ pub fn ctl_data_gp_memory(channel: usize) -> Vec> { cols } -pub fn ctl_filter_code_memory() -> Column { - Column::sum(COL_MAP.op.iter()) +pub(crate) fn ctl_data_partial_memory() -> Vec> { + let channel_map = COL_MAP.partial_channel; + let values = COL_MAP.mem_channels[0].value; + let mut cols: Vec<_> = Column::singles([ + channel_map.is_read, + channel_map.addr_context, + channel_map.addr_segment, + channel_map.addr_virtual, + ]) + .collect(); + + cols.extend(Column::singles(values)); + + cols.push(mem_time_and_channel( + MEM_GP_CHANNELS_IDX_START + NUM_GP_CHANNELS, + )); + + cols +} + +/// Old stack pointer write for SET_CONTEXT. +pub(crate) fn ctl_data_memory_old_sp_write_set_context() -> Vec> { + let mut cols = vec![ + Column::constant(F::ZERO), // is_read + Column::single(COL_MAP.context), // addr_context + Column::constant(F::from_canonical_usize(Segment::ContextMetadata.unscale())), // addr_segment + Column::constant(F::from_canonical_usize( + ContextMetadata::StackSize.unscale(), + )), // addr_virtual + ]; + + // Low limb is current stack length minus one. + cols.push(Column::linear_combination_with_constant( + [(COL_MAP.stack_len, F::ONE)], + -F::ONE, + )); + + // High limbs of the value are all zero. + cols.extend(repeat(Column::constant(F::ZERO)).take(VALUE_LIMBS - 1)); + + cols.push(mem_time_and_channel(MEM_GP_CHANNELS_IDX_START + 1)); + + cols } -pub fn ctl_filter_gp_memory(channel: usize) -> Column { - Column::single(COL_MAP.mem_channels[channel].used) +/// New stack pointer read for SET_CONTEXT. 
+pub(crate) fn ctl_data_memory_new_sp_read_set_context() -> Vec> { + let mut cols = vec![ + Column::constant(F::ONE), // is_read + Column::single(COL_MAP.mem_channels[0].value[2]), // addr_context (in the top of the stack) + Column::constant(F::from_canonical_usize(Segment::ContextMetadata.unscale())), // addr_segment + Column::constant(F::from_canonical_u64( + ContextMetadata::StackSize as u64 - Segment::ContextMetadata as u64, + )), // addr_virtual + ]; + + // Low limb is new stack length. + cols.push(Column::single_next_row(COL_MAP.stack_len)); + + // High limbs of the value are all zero. + cols.extend(repeat(Column::constant(F::ZERO)).take(VALUE_LIMBS - 1)); + + cols.push(mem_time_and_channel(MEM_GP_CHANNELS_IDX_START + 2)); + + cols +} + +/// CTL filter for code read and write operations. +pub(crate) fn ctl_filter_code_memory() -> Filter { + Filter::new_simple(Column::sum(COL_MAP.op.iter())) +} + +/// CTL filter for General Purpose memory read and write operations. +pub(crate) fn ctl_filter_gp_memory(channel: usize) -> Filter { + Filter::new_simple(Column::single(COL_MAP.mem_channels[channel].used)) +} + +pub(crate) fn ctl_filter_partial_memory() -> Filter { + Filter::new_simple(Column::single(COL_MAP.partial_channel.used)) +} + +/// CTL filter for the `SET_CONTEXT` operation. +/// SET_CONTEXT is differentiated from GET_CONTEXT by its zeroth bit set to 1 +pub(crate) fn ctl_filter_set_context() -> Filter { + Filter::new( + vec![( + Column::single(COL_MAP.op.context_op), + Column::single(COL_MAP.opcode_bits[0]), + )], + vec![], + ) +} + +/// Disable the specified memory channels. +/// Since channel 0 contains the top of the stack and is handled specially, +/// channels to disable are 1, 2 or both. All cases can be expressed as a vec. +pub(crate) fn disable_unused_channels( + lv: &CpuColumnsView

, + filter: P, + channels: Vec, + yield_constr: &mut ConstraintConsumer

, +) { + for i in channels { + yield_constr.constraint(filter * lv.mem_channels[i].used); + } +} + +/// Circuit version of `disable_unused_channels`. +/// Disable the specified memory channels. +/// Since channel 0 contains the top of the stack and is handled specially, +/// channels to disable are 1, 2 or both. All cases can be expressed as a vec. +pub(crate) fn disable_unused_channels_circuit, const D: usize>( + builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, + lv: &CpuColumnsView>, + filter: ExtensionTarget, + channels: Vec, + yield_constr: &mut RecursiveConstraintConsumer, +) { + for i in channels { + let constr = builder.mul_extension(filter, lv.mem_channels[i].used); + yield_constr.constraint(builder, constr); + } } +/// Structure representing the CPU Stark. #[derive(Copy, Clone, Default)] -pub struct CpuStark { +pub(crate) struct CpuStark { pub f: PhantomData, } @@ -207,6 +459,7 @@ impl, const D: usize> Stark for CpuStark, NUM_CPU_COLUMNS>; + /// Evaluates all CPU constraints. 
fn eval_packed_generic( &self, vars: &Self::EvaluationFrame, @@ -220,7 +473,8 @@ impl, const D: usize> Stark for CpuStark = next_values.borrow(); - bootstrap_kernel::eval_bootstrap_kernel_packed(local_values, next_values, yield_constr); + byte_unpacking::eval_packed(local_values, next_values, yield_constr); + clock::eval_packed(local_values, next_values, yield_constr); contextops::eval_packed(local_values, next_values, yield_constr); control_flow::eval_packed_generic(local_values, next_values, yield_constr); decode::eval_packed_generic(local_values, yield_constr); @@ -236,10 +490,11 @@ impl, const D: usize> Stark for CpuStark, @@ -253,12 +508,8 @@ impl, const D: usize> Stark for CpuStark> = next_values.borrow(); - bootstrap_kernel::eval_bootstrap_kernel_ext_circuit( - builder, - local_values, - next_values, - yield_constr, - ); + byte_unpacking::eval_ext_circuit(builder, local_values, next_values, yield_constr); + clock::eval_ext_circuit(builder, local_values, next_values, yield_constr); contextops::eval_ext_circuit(builder, local_values, next_values, yield_constr); control_flow::eval_ext_circuit(builder, local_values, next_values, yield_constr); decode::eval_ext_circuit(builder, local_values, yield_constr); @@ -274,7 +525,6 @@ impl, const D: usize> Stark for CpuStark [bool; 8] { ] } -pub fn eval_packed_generic( +/// Evaluates the constraints for opcode decoding. +pub(crate) fn eval_packed_generic( lv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { @@ -134,22 +131,96 @@ pub fn eval_packed_generic( yield_constr.constraint(lv[col] * (unavailable + opcode_mismatch)); } + let opcode_high_bits = |num_high_bits| -> P { + lv.opcode_bits + .into_iter() + .enumerate() + .rev() + .take(num_high_bits) + .map(|(i, bit)| bit * P::Scalar::from_canonical_u64(1 << i)) + .sum() + }; + // Manually check lv.op.m_op_constr - let opcode: P = lv - .opcode_bits - .into_iter() - .enumerate() - .map(|(i, bit)| bit * P::Scalar::from_canonical_u64(1 << i)) - .sum(); + let opcode = opcode_high_bits(8); yield_constr.constraint((P::ONES - kernel_mode) * lv.op.m_op_general); let m_op_constr = (opcode - P::Scalar::from_canonical_usize(0xfb_usize)) * (opcode - P::Scalar::from_canonical_usize(0xfc_usize)) * lv.op.m_op_general; yield_constr.constraint(m_op_constr); + + // Manually check lv.op.jumpdest_keccak_general. + // KECCAK_GENERAL is a kernel-only instruction, but not JUMPDEST. + // JUMPDEST is differentiated from KECCAK_GENERAL by its second bit set to 1. + yield_constr.constraint( + (P::ONES - kernel_mode) * lv.op.jumpdest_keccak_general * (P::ONES - lv.opcode_bits[1]), + ); + + // Check the JUMPDEST and KERNEL_GENERAL opcodes. + let jumpdest_opcode = P::Scalar::from_canonical_usize(0x5b); + let keccak_general_opcode = P::Scalar::from_canonical_usize(0x21); + let jumpdest_keccak_general_constr = (opcode - keccak_general_opcode) + * (opcode - jumpdest_opcode) + * lv.op.jumpdest_keccak_general; + yield_constr.constraint(jumpdest_keccak_general_constr); + + // Manually check lv.op.pc_push0. + // Both PC and PUSH0 can be called outside of the kernel mode: + // there is no need to constrain them in that regard. + let pc_push0_constr = (opcode - P::Scalar::from_canonical_usize(0x58_usize)) + * (opcode - P::Scalar::from_canonical_usize(0x5f_usize)) + * lv.op.pc_push0; + yield_constr.constraint(pc_push0_constr); + + // Manually check lv.op.not_pop. 
+ // Both NOT and POP can be called outside of the kernel mode: + // there is no need to constrain them in that regard. + let not_pop_op = (opcode - P::Scalar::from_canonical_usize(0x19_usize)) + * (opcode - P::Scalar::from_canonical_usize(0x50_usize)) + * lv.op.not_pop; + yield_constr.constraint(not_pop_op); + + // Manually check lv.op.m_op_32bytes. + // Both are kernel-only. + yield_constr.constraint((P::ONES - kernel_mode) * lv.op.m_op_32bytes); + + // Check the MSTORE_32BYTES and MLOAD-32BYTES opcodes. + let opcode_high_three = opcode_high_bits(3); + let op_32bytes = (opcode_high_three - P::Scalar::from_canonical_usize(0xc0_usize)) + * (opcode - P::Scalar::from_canonical_usize(0xf8_usize)) + * lv.op.m_op_32bytes; + yield_constr.constraint(op_32bytes); + + // Manually check PUSH and PROVER_INPUT. + // PROVER_INPUT is a kernel-only instruction, but not PUSH. + let push_prover_input_constr = (opcode - P::Scalar::from_canonical_usize(0x49_usize)) + * (opcode_high_three - P::Scalar::from_canonical_usize(0x60_usize)) + * lv.op.push_prover_input; + yield_constr.constraint(push_prover_input_constr); + let prover_input_constr = + lv.op.push_prover_input * (lv.opcode_bits[5] - P::ONES) * (P::ONES - kernel_mode); + yield_constr.constraint(prover_input_constr); +} + +fn opcode_high_bits_circuit, const D: usize>( + builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, + lv: &CpuColumnsView>, + num_high_bits: usize, +) -> ExtensionTarget { + lv.opcode_bits + .into_iter() + .enumerate() + .rev() + .take(num_high_bits) + .fold(builder.zero_extension(), |cumul, (i, bit)| { + builder.mul_const_add_extension(F::from_canonical_usize(1 << i), bit, cumul) + }) } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed_generic`. +/// Evaluates the constraints for opcode decoding. 
+pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, @@ -227,13 +298,7 @@ pub fn eval_ext_circuit, const D: usize>( } // Manually check lv.op.m_op_constr - let opcode = lv - .opcode_bits - .into_iter() - .rev() - .fold(builder.zero_extension(), |cumul, bit| { - builder.mul_const_add_extension(F::TWO, cumul, bit) - }); + let opcode = opcode_high_bits_circuit(builder, lv, 8); let mload_opcode = builder.constant_extension(F::Extension::from_canonical_usize(0xfb_usize)); let mstore_opcode = builder.constant_extension(F::Extension::from_canonical_usize(0xfc_usize)); @@ -249,4 +314,92 @@ pub fn eval_ext_circuit, const D: usize>( m_op_constr = builder.mul_extension(m_op_constr, lv.op.m_op_general); yield_constr.constraint(builder, m_op_constr); + + // Manually check lv.op.jumpdest_keccak_general. + // KECCAK_GENERAL is a kernel-only instruction, but not JUMPDEST. + // JUMPDEST is differentiated from KECCAK_GENERAL by its second bit set to 1. + let jumpdest_opcode = + builder.constant_extension(F::Extension::from_canonical_usize(0x5b_usize)); + let keccak_general_opcode = + builder.constant_extension(F::Extension::from_canonical_usize(0x21_usize)); + + // Check that KECCAK_GENERAL is kernel-only. + let mut kernel_general_filter = builder.sub_extension(one, lv.opcode_bits[1]); + kernel_general_filter = + builder.mul_extension(lv.op.jumpdest_keccak_general, kernel_general_filter); + let constr = builder.mul_extension(is_not_kernel_mode, kernel_general_filter); + yield_constr.constraint(builder, constr); + + // Check the JUMPDEST and KERNEL_GENERAL opcodes. 
+ let jumpdest_constr = builder.sub_extension(opcode, jumpdest_opcode); + let keccak_general_constr = builder.sub_extension(opcode, keccak_general_opcode); + let mut jumpdest_keccak_general_constr = + builder.mul_extension(jumpdest_constr, keccak_general_constr); + jumpdest_keccak_general_constr = builder.mul_extension( + jumpdest_keccak_general_constr, + lv.op.jumpdest_keccak_general, + ); + + yield_constr.constraint(builder, jumpdest_keccak_general_constr); + + // Manually check lv.op.pc_push0. + // Both PC and PUSH0 can be called outside of the kernel mode: + // there is no need to constrain them in that regard. + let pc_opcode = builder.constant_extension(F::Extension::from_canonical_usize(0x58_usize)); + let push0_opcode = builder.constant_extension(F::Extension::from_canonical_usize(0x5f_usize)); + let pc_constr = builder.sub_extension(opcode, pc_opcode); + let push0_constr = builder.sub_extension(opcode, push0_opcode); + let mut pc_push0_constr = builder.mul_extension(pc_constr, push0_constr); + pc_push0_constr = builder.mul_extension(pc_push0_constr, lv.op.pc_push0); + yield_constr.constraint(builder, pc_push0_constr); + + // Manually check lv.op.not_pop. + // Both NOT and POP can be called outside of the kernel mode: + // there is no need to constrain them in that regard. + let not_opcode = builder.constant_extension(F::Extension::from_canonical_usize(0x19_usize)); + let pop_opcode = builder.constant_extension(F::Extension::from_canonical_usize(0x50_usize)); + + let not_constr = builder.sub_extension(opcode, not_opcode); + let pop_constr = builder.sub_extension(opcode, pop_opcode); + + let mut not_pop_constr = builder.mul_extension(not_constr, pop_constr); + not_pop_constr = builder.mul_extension(lv.op.not_pop, not_pop_constr); + yield_constr.constraint(builder, not_pop_constr); + + // Manually check lv.op.m_op_32bytes. + // Both are kernel-only. 
+ let constr = builder.mul_extension(is_not_kernel_mode, lv.op.m_op_32bytes); + yield_constr.constraint(builder, constr); + + // Check the MSTORE_32BYTES and MLOAD-32BYTES opcodes. + let opcode_high_three = opcode_high_bits_circuit(builder, lv, 3); + let mstore_32bytes_opcode = + builder.constant_extension(F::Extension::from_canonical_usize(0xc0_usize)); + let mload_32bytes_opcode = + builder.constant_extension(F::Extension::from_canonical_usize(0xf8_usize)); + let mstore_32bytes_constr = builder.sub_extension(opcode_high_three, mstore_32bytes_opcode); + let mload_32bytes_constr = builder.sub_extension(opcode, mload_32bytes_opcode); + let constr = builder.mul_extension(mstore_32bytes_constr, mload_32bytes_constr); + let constr = builder.mul_extension(constr, lv.op.m_op_32bytes); + yield_constr.constraint(builder, constr); + + // Manually check PUSH and PROVER_INPUT. + // PROVER_INPUT is a kernel-only instruction, but not PUSH. + let prover_input_opcode = + builder.constant_extension(F::Extension::from_canonical_usize(0x49usize)); + let push_opcodes = builder.constant_extension(F::Extension::from_canonical_usize(0x60usize)); + + let push_constr = builder.sub_extension(opcode_high_three, push_opcodes); + let prover_input_constr = builder.sub_extension(opcode, prover_input_opcode); + + let push_prover_input_constr = + builder.mul_many_extension([lv.op.push_prover_input, prover_input_constr, push_constr]); + yield_constr.constraint(builder, push_prover_input_constr); + let prover_input_filter = builder.mul_sub_extension( + lv.op.push_prover_input, + lv.opcode_bits[5], + lv.op.push_prover_input, + ); + let constr = builder.mul_extension(prover_input_filter, is_not_kernel_mode); + yield_constr.constraint(builder, constr); } diff --git a/evm/src/cpu/docs/out-of-gas.md b/evm/src/cpu/docs/out-of-gas.md deleted file mode 100644 index 733384b27e..0000000000 --- a/evm/src/cpu/docs/out-of-gas.md +++ /dev/null @@ -1,23 +0,0 @@ -# Out of Gas Errors - -The CPU table has a `gas` 
register that keeps track of the gas used by the transaction so far. - -The crucial invariant in our out-of-gas checking method is that at any point in the program's execution, we have not used more gas than we have available; that is `gas` is at most the gas allocation for the transaction (which is stored separately by the kernel). We assume that the gas allocation will never be 2^32 or more, so if `gas` does not fit in one limb, then we've run out of gas. - -When a native instruction (one that is not a syscall) is executed, a constraint ensures that the `gas` register is increased by the correct amount. This is not automatic for syscalls; the syscall handler itself must calculate and charge the appropriate amount. - -If everything goes smoothly and we have not run out of gas, `gas` should be no more than the gas allowance at the point that we `STOP`, `REVERT`, stack overflow, or whatever. Indeed, because we assume that the gas overflow handler is invoked _as soon as_ we've run out of gas, all these termination methods must verify that `gas` <= allowance, and `PANIC` if this is not the case. This is also true for the out-of-gas handler, which should check that (a) we have not yet run out of gas and (b) we are about to run out of gas, `PANIC`king if either of those does not hold. - -When we do run out of gas, however, this event must be handled. Syscalls are responsible for checking that their execution would not cause the transaction to run out of gas. If the syscall detects that it would need to charge more gas than available, it must abort the transaction by jumping to `exc_out_of_gas`, which in turn verifies that the out-of-gas hasn't _already_ occured. - -Native instructions do this differently. If the prover notices that execution of the instruction would cause an out-of-gas error, it must jump to the appropriate handler instead of executing the instruction. (The handler contains special code that `PANIC`s if the prover invoked it incorrectly.) 
- -## Overflow - -We must be careful to ensure that `gas` does not overflow to prevent denial of service attacks. - -Note that a syscall cannot be the instruction that causes an overflow. This is because every syscall is required to verify that its execution does not cause us to exceed the gas limit. Upon entry into a syscall, a constraint verifies that `gas` < 2^32. Some syscalls may have to be careful to ensure that the gas check is performed correctly (for example, that overflow modulo 2^256 does not occur). So we can assume that upon entry and exit out of a syscall, `gas` < 2^32. - -Similarly, native instructions alone cannot cause wraparound. The most expensive instruction, `JUMPI`, costs 10 gas. Even if we were to execute 2^32 consecutive `JUMPI` instructions, the maximum length of a trace, we are nowhere close to consuming 2^64 - 2^32 + 1 (= Golilocks prime) gas. - -The final scenario we must tackle is an expensive syscall followed by many expensive native instructions. Upon exit from a syscall, `gas` < 2^32. Again, even if that syscall is followed by 2^32 native instructions of cost 10, we do not see wraparound modulo Goldilocks. diff --git a/evm/src/cpu/dup_swap.rs b/evm/src/cpu/dup_swap.rs index 0cc6c67c8f..1abec5fc61 100644 --- a/evm/src/cpu/dup_swap.rs +++ b/evm/src/cpu/dup_swap.rs @@ -53,7 +53,7 @@ fn constrain_channel_packed( yield_constr.constraint(filter * (channel.is_read - P::Scalar::from_bool(is_read))); yield_constr.constraint(filter * (channel.addr_context - lv.context)); yield_constr.constraint( - filter * (channel.addr_segment - P::Scalar::from_canonical_u64(Segment::Stack as u64)), + filter * (channel.addr_segment - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), ); // Top of the stack is at `addr = lv.stack_len - 1`. 
let addr_virtual = lv.stack_len - P::ONES - offset; @@ -93,13 +93,14 @@ fn constrain_channel_ext_circuit, const D: usize>( { let constr = builder.arithmetic_extension( F::ONE, - -F::from_canonical_u64(Segment::Stack as u64), + -F::from_canonical_usize(Segment::Stack.unscale()), filter, channel.addr_segment, filter, ); yield_constr.constraint(builder, constr); } + // Top of the stack is at `addr = lv.stack_len - 1`. { let constr = builder.add_extension(channel.addr_virtual, offset); let constr = builder.sub_extension(constr, lv.stack_len); @@ -108,6 +109,7 @@ fn constrain_channel_ext_circuit, const D: usize>( } } +/// Evaluates constraints for DUP. fn eval_packed_dup( n: P, lv: &CpuColumnsView

, @@ -120,18 +122,25 @@ fn eval_packed_dup( let write_channel = &lv.mem_channels[1]; let read_channel = &lv.mem_channels[2]; + // Constrain the input and top of the stack channels to have the same value. channels_equal_packed(filter, write_channel, &lv.mem_channels[0], yield_constr); + // Constrain the output channel's addresses, `is_read` and `used` fields. constrain_channel_packed(false, filter, P::ZEROS, write_channel, lv, yield_constr); + // Constrain the output and top of the stack channels to have the same value. channels_equal_packed(filter, read_channel, &nv.mem_channels[0], yield_constr); + // Constrain the input channel's addresses, `is_read` and `used` fields. constrain_channel_packed(true, filter, n, read_channel, lv, yield_constr); // Constrain nv.stack_len. yield_constr.constraint_transition(filter * (nv.stack_len - lv.stack_len - P::ONES)); - // TODO: Constrain unused channels? + // Disable next top. + yield_constr.constraint(filter * nv.mem_channels[0].used); } +/// Circuit version of `eval_packed_dup`. +/// Evaluates constraints for DUP. fn eval_ext_circuit_dup, const D: usize>( builder: &mut CircuitBuilder, n: ExtensionTarget, @@ -148,6 +157,7 @@ fn eval_ext_circuit_dup, const D: usize>( let write_channel = &lv.mem_channels[1]; let read_channel = &lv.mem_channels[2]; + // Constrain the input and top of the stack channels to have the same value. channels_equal_ext_circuit( builder, filter, @@ -155,6 +165,7 @@ fn eval_ext_circuit_dup, const D: usize>( &lv.mem_channels[0], yield_constr, ); + // Constrain the output channel's addresses, `is_read` and `used` fields. constrain_channel_ext_circuit( builder, false, @@ -165,6 +176,7 @@ fn eval_ext_circuit_dup, const D: usize>( yield_constr, ); + // Constrain the output and top of the stack channels to have the same value. 
channels_equal_ext_circuit( builder, filter, @@ -172,16 +184,24 @@ fn eval_ext_circuit_dup, const D: usize>( &nv.mem_channels[0], yield_constr, ); + // Constrain the input channel's addresses, `is_read` and `used` fields. constrain_channel_ext_circuit(builder, true, filter, n, read_channel, lv, yield_constr); // Constrain nv.stack_len. - let diff = builder.sub_extension(nv.stack_len, lv.stack_len); - let constr = builder.mul_sub_extension(filter, diff, filter); - yield_constr.constraint_transition(builder, constr); + { + let diff = builder.sub_extension(nv.stack_len, lv.stack_len); + let constr = builder.mul_sub_extension(filter, diff, filter); + yield_constr.constraint_transition(builder, constr); + } - // TODO: Constrain unused channels? + // Disable next top. + { + let constr = builder.mul_extension(filter, nv.mem_channels[0].used); + yield_constr.constraint(builder, constr); + } } +/// Evaluates constraints for SWAP. fn eval_packed_swap( n: P, lv: &CpuColumnsView

, @@ -197,18 +217,26 @@ fn eval_packed_swap( let in2_channel = &lv.mem_channels[1]; let out_channel = &lv.mem_channels[2]; + // Constrain the first input channel value to be equal to the output channel value. channels_equal_packed(filter, in1_channel, out_channel, yield_constr); + // We set `is_read`, `used` and the address for the first input. The first input is + // read from the top of the stack, and is therefore not a memory read. constrain_channel_packed(false, filter, n_plus_one, out_channel, lv, yield_constr); + // Constrain the second input channel value to be equal to the new top of the stack. channels_equal_packed(filter, in2_channel, &nv.mem_channels[0], yield_constr); + // We set `is_read`, `used` and the address for the second input. constrain_channel_packed(true, filter, n_plus_one, in2_channel, lv, yield_constr); - // Constrain nv.stack_len; + // Constrain nv.stack_len. yield_constr.constraint(filter * (nv.stack_len - lv.stack_len)); - // TODO: Constrain unused channels? + // Disable next top. + yield_constr.constraint(filter * nv.mem_channels[0].used); } +/// Circuit version of `eval_packed_swap`. +/// Evaluates constraints for SWAP. fn eval_ext_circuit_swap, const D: usize>( builder: &mut CircuitBuilder, n: ExtensionTarget, @@ -226,7 +254,10 @@ fn eval_ext_circuit_swap, const D: usize>( let in2_channel = &lv.mem_channels[1]; let out_channel = &lv.mem_channels[2]; + // Constrain the first input channel value to be equal to the output channel value. channels_equal_ext_circuit(builder, filter, in1_channel, out_channel, yield_constr); + // We set `is_read`, `used` and the address for the first input. The first input is + // read from the top of the stack, and is therefore not a memory read. constrain_channel_ext_circuit( builder, false, @@ -237,6 +268,7 @@ fn eval_ext_circuit_swap, const D: usize>( yield_constr, ); + // Constrain the second input channel value to be equal to the new top of the stack. 
channels_equal_ext_circuit( builder, filter, @@ -244,6 +276,7 @@ fn eval_ext_circuit_swap, const D: usize>( &nv.mem_channels[0], yield_constr, ); + // We set `is_read`, `used` and the address for the second input. constrain_channel_ext_circuit( builder, true, @@ -259,10 +292,15 @@ fn eval_ext_circuit_swap, const D: usize>( let constr = builder.mul_extension(filter, diff); yield_constr.constraint(builder, constr); - // TODO: Constrain unused channels? + // Disable next top. + { + let constr = builder.mul_extension(filter, nv.mem_channels[0].used); + yield_constr.constraint(builder, constr); + } } -pub fn eval_packed( +/// Evaluates the constraints for the DUP and SWAP opcodes. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -274,9 +312,14 @@ pub fn eval_packed( eval_packed_dup(n, lv, nv, yield_constr); eval_packed_swap(n, lv, nv, yield_constr); + + // For both, disable the partial channel. + yield_constr.constraint(lv.op.dup_swap * lv.partial_channel.used); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates the constraints for the DUP and SWAP opcodes. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, @@ -291,4 +334,10 @@ pub fn eval_ext_circuit, const D: usize>( eval_ext_circuit_dup(builder, n, lv, nv, yield_constr); eval_ext_circuit_swap(builder, n, lv, nv, yield_constr); + + // For both, disable the partial channel. + { + let constr = builder.mul_extension(lv.op.dup_swap, lv.partial_channel.used); + yield_constr.constraint(builder, constr); + } } diff --git a/evm/src/cpu/gas.rs b/evm/src/cpu/gas.rs index 1a908d6df4..be033c3c43 100644 --- a/evm/src/cpu/gas.rs +++ b/evm/src/cpu/gas.rs @@ -24,21 +24,15 @@ const SIMPLE_OPCODES: OpsColumnsView> = OpsColumnsView { fp254_op: KERNEL_ONLY_INSTR, eq_iszero: G_VERYLOW, logic_op: G_VERYLOW, - not: G_VERYLOW, + not_pop: None, // This is handled manually below shift: G_VERYLOW, - keccak_general: KERNEL_ONLY_INSTR, - prover_input: KERNEL_ONLY_INSTR, - pop: G_BASE, - jumps: None, // Combined flag handled separately. - pc: G_BASE, - jumpdest: G_JUMPDEST, - push0: G_BASE, - push: G_VERYLOW, + jumpdest_keccak_general: None, // This is handled manually below. + push_prover_input: None, // This is handled manually below. + jumps: None, // Combined flag handled separately. 
+ pc_push0: G_BASE, dup_swap: G_VERYLOW, - get_context: KERNEL_ONLY_INSTR, - set_context: KERNEL_ONLY_INSTR, - mstore_32bytes: KERNEL_ONLY_INSTR, - mload_32bytes: KERNEL_ONLY_INSTR, + context_op: KERNEL_ONLY_INSTR, + m_op_32bytes: KERNEL_ONLY_INSTR, exit_kernel: None, m_op_general: KERNEL_ONLY_INSTR, syscall: None, @@ -70,15 +64,11 @@ fn eval_packed_accumulate( }) .sum(); - // TODO: This may cause soundness issue if the recomputed gas (as u64) overflows the field size. - // This is fine as we are only using two-limbs for testing purposes (to support all cases from - // the Ethereum test suite). - // This should be changed back to a single 32-bit limb before going into production! - let gas_diff = nv.gas[1] * P::Scalar::from_canonical_u64(1 << 32) + nv.gas[0] - - (lv.gas[1] * P::Scalar::from_canonical_u64(1 << 32) + lv.gas[0]); - let constr = gas_diff - gas_used; + let constr = nv.gas - (lv.gas + gas_used); yield_constr.constraint_transition(filter * constr); + let gas_diff = nv.gas - lv.gas; + for (maybe_cost, op_flag) in izip!(SIMPLE_OPCODES.into_iter(), lv.op.into_iter()) { if let Some(cost) = maybe_cost { let cost = P::Scalar::from_canonical_u32(cost); @@ -105,6 +95,30 @@ fn eval_packed_accumulate( let ternary_op_cost = P::Scalar::from_canonical_u32(G_MID.unwrap()) - lv.opcode_bits[1] * P::Scalar::from_canonical_u32(G_MID.unwrap()); yield_constr.constraint_transition(lv.op.ternary_op * (gas_diff - ternary_op_cost)); + + // For NOT and POP. + // NOT is differentiated from POP by its first bit set to 1. + let not_pop_cost = (P::ONES - lv.opcode_bits[0]) + * P::Scalar::from_canonical_u32(G_BASE.unwrap()) + + lv.opcode_bits[0] * P::Scalar::from_canonical_u32(G_VERYLOW.unwrap()); + yield_constr.constraint_transition(lv.op.not_pop * (gas_diff - not_pop_cost)); + + // For JUMPDEST and KECCAK_GENERAL. + // JUMPDEST is differentiated from KECCAK_GENERAL by its second bit set to 1. 
+ let jumpdest_keccak_general_gas_cost = lv.opcode_bits[1] + * P::Scalar::from_canonical_u32(G_JUMPDEST.unwrap()) + + (P::ONES - lv.opcode_bits[1]) * P::Scalar::from_canonical_u32(KERNEL_ONLY_INSTR.unwrap()); + yield_constr.constraint_transition( + lv.op.jumpdest_keccak_general * (gas_diff - jumpdest_keccak_general_gas_cost), + ); + + // For PROVER_INPUT and PUSH operations. + // PUSH operations are differentiated from PROVER_INPUT by their 6th bit set to 1. + let push_prover_input_gas_cost = lv.opcode_bits[5] + * P::Scalar::from_canonical_u32(G_VERYLOW.unwrap()) + + (P::ONES - lv.opcode_bits[5]) * P::Scalar::from_canonical_u32(KERNEL_ONLY_INSTR.unwrap()); + yield_constr + .constraint_transition(lv.op.push_prover_input * (gas_diff - push_prover_input_gas_cost)); } fn eval_packed_init( @@ -117,11 +131,11 @@ fn eval_packed_init( // `nv` is the first row that executes an instruction. let filter = (is_cpu_cycle - P::ONES) * is_cpu_cycle_next; // Set initial gas to zero. - yield_constr.constraint_transition(filter * nv.gas[0]); - yield_constr.constraint_transition(filter * nv.gas[1]); + yield_constr.constraint_transition(filter * nv.gas); } -pub fn eval_packed( +/// Evaluate the gas constraints for the opcodes that cost a constant gas. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -161,22 +175,16 @@ fn eval_ext_circuit_accumulate, const D: usize>( }, ); - // TODO: This may cause soundness issue if the recomputed gas (as u64) overflows the field size. - // This is fine as we are only using two-limbs for testing purposes (to support all cases from - // the Ethereum test suite). - // This should be changed back to a single 32-bit limb before going into production! - let nv_gas = - builder.mul_const_add_extension(F::from_canonical_u64(1 << 32), nv.gas[1], nv.gas[0]); - let lv_gas = - builder.mul_const_add_extension(F::from_canonical_u64(1 << 32), lv.gas[1], lv.gas[0]); - let nv_lv_diff = builder.sub_extension(nv_gas, lv_gas); - - let constr = builder.sub_extension(nv_lv_diff, gas_used); + let constr = { + let t = builder.add_extension(lv.gas, gas_used); + builder.sub_extension(nv.gas, t) + }; let filtered_constr = builder.mul_extension(filter, constr); yield_constr.constraint_transition(builder, filtered_constr); for (maybe_cost, op_flag) in izip!(SIMPLE_OPCODES.into_iter(), lv.op.into_iter()) { if let Some(cost) = maybe_cost { + let nv_lv_diff = builder.sub_extension(nv.gas, lv.gas); let constr = builder.arithmetic_extension( F::ONE, -F::from_canonical_u32(cost), @@ -197,6 +205,7 @@ fn eval_ext_circuit_accumulate, const D: usize>( let jump_gas_cost = builder.add_const_extension(jump_gas_cost, F::from_canonical_u32(G_MID.unwrap())); + let nv_lv_diff = builder.sub_extension(nv.gas, lv.gas); let gas_diff = builder.sub_extension(nv_lv_diff, jump_gas_cost); let constr = builder.mul_extension(filter, gas_diff); yield_constr.constraint_transition(builder, constr); @@ -216,6 +225,7 @@ fn eval_ext_circuit_accumulate, const D: usize>( let binary_op_cost = builder.add_const_extension(binary_op_cost, F::from_canonical_u32(G_LOW.unwrap())); + let nv_lv_diff = builder.sub_extension(nv.gas, lv.gas); let gas_diff = builder.sub_extension(nv_lv_diff, binary_op_cost); let constr = builder.mul_extension(filter, gas_diff); 
yield_constr.constraint_transition(builder, constr); @@ -230,9 +240,58 @@ fn eval_ext_circuit_accumulate, const D: usize>( let ternary_op_cost = builder.add_const_extension(ternary_op_cost, F::from_canonical_u32(G_MID.unwrap())); + let nv_lv_diff = builder.sub_extension(nv.gas, lv.gas); let gas_diff = builder.sub_extension(nv_lv_diff, ternary_op_cost); let constr = builder.mul_extension(filter, gas_diff); yield_constr.constraint_transition(builder, constr); + + // For NOT and POP. + // NOT is differentiated from POP by its first bit set to 1. + let filter = lv.op.not_pop; + let one = builder.one_extension(); + let mut not_pop_cost = + builder.mul_const_extension(F::from_canonical_u32(G_VERYLOW.unwrap()), lv.opcode_bits[0]); + let mut pop_cost = builder.sub_extension(one, lv.opcode_bits[0]); + pop_cost = builder.mul_const_extension(F::from_canonical_u32(G_BASE.unwrap()), pop_cost); + not_pop_cost = builder.add_extension(not_pop_cost, pop_cost); + + let not_pop_gas_diff = builder.sub_extension(nv_lv_diff, not_pop_cost); + let not_pop_constr = builder.mul_extension(filter, not_pop_gas_diff); + yield_constr.constraint_transition(builder, not_pop_constr); + + // For JUMPDEST and KECCAK_GENERAL. + // JUMPDEST is differentiated from KECCAK_GENERAL by its second bit set to 1. + let one = builder.one_extension(); + let filter = lv.op.jumpdest_keccak_general; + + let jumpdest_keccak_general_gas_cost = builder.arithmetic_extension( + F::from_canonical_u32(G_JUMPDEST.unwrap()) + - F::from_canonical_u32(KERNEL_ONLY_INSTR.unwrap()), + F::from_canonical_u32(KERNEL_ONLY_INSTR.unwrap()), + lv.opcode_bits[1], + one, + one, + ); + + let gas_diff = builder.sub_extension(nv_lv_diff, jumpdest_keccak_general_gas_cost); + let constr = builder.mul_extension(filter, gas_diff); + + yield_constr.constraint_transition(builder, constr); + + // For PROVER_INPUT and PUSH operations. + // PUSH operations are differentiated from PROVER_INPUT by their 6th bit set to 1. 
+ let push_prover_input_gas_cost = builder.arithmetic_extension( + F::from_canonical_u32(G_VERYLOW.unwrap()) + - F::from_canonical_u32(KERNEL_ONLY_INSTR.unwrap()), + F::from_canonical_u32(KERNEL_ONLY_INSTR.unwrap()), + lv.opcode_bits[5], + one, + one, + ); + let gas_diff = builder.sub_extension(nv_lv_diff, push_prover_input_gas_cost); + let constr = builder.mul_extension(lv.op.push_prover_input, gas_diff); + + yield_constr.constraint_transition(builder, constr); } fn eval_ext_circuit_init, const D: usize>( @@ -246,18 +305,20 @@ fn eval_ext_circuit_init, const D: usize>( let is_cpu_cycle_next = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| nv[col_i])); let filter = builder.mul_sub_extension(is_cpu_cycle, is_cpu_cycle_next, is_cpu_cycle_next); // Set initial gas to zero. - let constr = builder.mul_extension(filter, nv.gas[0]); - yield_constr.constraint_transition(builder, constr); - let constr = builder.mul_extension(filter, nv.gas[1]); + let constr = builder.mul_extension(filter, nv.gas); yield_constr.constraint_transition(builder, constr); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluate the gas constraints for the opcodes that cost a constant gas. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { + // Evaluates the transition gas constraints. eval_ext_circuit_accumulate(builder, lv, nv, yield_constr); + // Evaluates the initial gas constraints. 
eval_ext_circuit_init(builder, lv, nv, yield_constr); } diff --git a/evm/src/cpu/halt.rs b/evm/src/cpu/halt.rs index 9ad34344ea..80ac32853c 100644 --- a/evm/src/cpu/halt.rs +++ b/evm/src/cpu/halt.rs @@ -11,7 +11,8 @@ use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer use crate::cpu::columns::{CpuColumnsView, COL_MAP}; use crate::cpu::membus::NUM_GP_CHANNELS; -pub fn eval_packed( +/// Evaluates constraints for the `halt` flag. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -19,13 +20,15 @@ pub fn eval_packed( let is_cpu_cycle: P = COL_MAP.op.iter().map(|&col_i| lv[col_i]).sum(); let is_cpu_cycle_next: P = COL_MAP.op.iter().map(|&col_i| nv[col_i]).sum(); - let halt_state = P::ONES - lv.is_bootstrap_kernel - is_cpu_cycle; - let next_halt_state = P::ONES - nv.is_bootstrap_kernel - is_cpu_cycle_next; + let halt_state = P::ONES - is_cpu_cycle; + let next_halt_state = P::ONES - is_cpu_cycle_next; // The halt flag must be boolean. yield_constr.constraint(halt_state * (halt_state - P::ONES)); // Once we reach a padding row, there must be only padding rows. yield_constr.constraint_transition(halt_state * (next_halt_state - P::ONES)); + // Check that we're in kernel mode. + yield_constr.constraint(halt_state * (lv.is_kernel_mode - P::ONES)); // Padding rows should have their memory channels disabled. for i in 0..NUM_GP_CHANNELS { @@ -45,7 +48,9 @@ pub fn eval_packed( yield_constr.constraint(halt_state * (lv.program_counter - halt_pc)); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints for the `halt` flag. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, @@ -56,10 +61,8 @@ pub fn eval_ext_circuit, const D: usize>( let is_cpu_cycle = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| lv[col_i])); let is_cpu_cycle_next = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| nv[col_i])); - let halt_state = builder.add_extension(lv.is_bootstrap_kernel, is_cpu_cycle); - let halt_state = builder.sub_extension(one, halt_state); - let next_halt_state = builder.add_extension(nv.is_bootstrap_kernel, is_cpu_cycle_next); - let next_halt_state = builder.sub_extension(one, next_halt_state); + let halt_state = builder.sub_extension(one, is_cpu_cycle); + let next_halt_state = builder.sub_extension(one, is_cpu_cycle_next); // The halt flag must be boolean. 
let constr = builder.mul_sub_extension(halt_state, halt_state, halt_state); @@ -67,6 +70,9 @@ pub fn eval_ext_circuit, const D: usize>( // Once we reach a padding row, there must be only padding rows. let constr = builder.mul_sub_extension(halt_state, next_halt_state, halt_state); yield_constr.constraint_transition(builder, constr); + // Check that we're in kernel mode. + let constr = builder.mul_sub_extension(halt_state, lv.is_kernel_mode, halt_state); + yield_constr.constraint(builder, constr); // Padding rows should have their memory channels disabled. for i in 0..NUM_GP_CHANNELS { diff --git a/evm/src/cpu/jumps.rs b/evm/src/cpu/jumps.rs index 0c03e2d178..fd7fcfd962 100644 --- a/evm/src/cpu/jumps.rs +++ b/evm/src/cpu/jumps.rs @@ -9,7 +9,8 @@ use crate::cpu::columns::CpuColumnsView; use crate::cpu::membus::NUM_GP_CHANNELS; use crate::memory::segments::Segment; -pub fn eval_packed_exit_kernel( +/// Evaluates constraints for EXIT_KERNEL. +pub(crate) fn eval_packed_exit_kernel( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -22,11 +23,14 @@ pub fn eval_packed_exit_kernel( // but we trust the kernel to set them to zero). yield_constr.constraint_transition(filter * (input[0] - nv.program_counter)); yield_constr.constraint_transition(filter * (input[1] - nv.is_kernel_mode)); - yield_constr.constraint_transition(filter * (input[6] - nv.gas[0])); - yield_constr.constraint_transition(filter * (input[7] - nv.gas[1])); + yield_constr.constraint_transition(filter * (input[6] - nv.gas)); + // High limb of gas must be 0 for convenient detection of overflow. + yield_constr.constraint(filter * input[7]); } -pub fn eval_ext_circuit_exit_kernel, const D: usize>( +/// Circuit version of `eval_packed_exit_kernel`. +/// Evaluates constraints for EXIT_KERNEL. +pub(crate) fn eval_ext_circuit_exit_kernel, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, @@ -48,18 +52,19 @@ pub fn eval_ext_circuit_exit_kernel, const D: usize yield_constr.constraint_transition(builder, kernel_constr); { - let diff = builder.sub_extension(input[6], nv.gas[0]); + let diff = builder.sub_extension(input[6], nv.gas); let constr = builder.mul_extension(filter, diff); yield_constr.constraint_transition(builder, constr); } { - let diff = builder.sub_extension(input[7], nv.gas[1]); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint_transition(builder, constr); + // High limb of gas must be 0 for convenient detection of overflow. + let constr = builder.mul_extension(filter, input[7]); + yield_constr.constraint(builder, constr); } } -pub fn eval_packed_jump_jumpi( +/// Evaluates constraints jump operations: JUMP and JUMPI. +pub(crate) fn eval_packed_jump_jumpi( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -82,7 +87,8 @@ pub fn eval_packed_jump_jumpi( yield_constr.constraint_transition(new_filter * (channel.is_read - P::ONES)); yield_constr.constraint_transition(new_filter * (channel.addr_context - nv.context)); yield_constr.constraint_transition( - new_filter * (channel.addr_segment - P::Scalar::from_canonical_u64(Segment::Stack as u64)), + new_filter + * (channel.addr_segment - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), ); let addr_virtual = nv.stack_len - P::ONES; yield_constr.constraint_transition(new_filter * (channel.addr_virtual - addr_virtual)); @@ -129,7 +135,7 @@ pub fn eval_packed_jump_jumpi( yield_constr.constraint( filter * (jumpdest_flag_channel.addr_segment - - P::Scalar::from_canonical_u64(Segment::JumpdestBits as u64)), + - P::Scalar::from_canonical_usize(Segment::JumpdestBits.unscale())), ); yield_constr.constraint(filter * (jumpdest_flag_channel.addr_virtual - dst[0])); @@ -137,6 +143,8 @@ pub fn eval_packed_jump_jumpi( for &channel in &lv.mem_channels[2..NUM_GP_CHANNELS - 1] { yield_constr.constraint(filter * channel.used); } + yield_constr.constraint(filter * lv.partial_channel.used); + // Channel 1 is unused by the `JUMP` instruction. yield_constr.constraint(is_jump * lv.mem_channels[1].used); @@ -156,7 +164,9 @@ pub fn eval_packed_jump_jumpi( .constraint_transition(filter * jumps_lv.should_jump * (nv.program_counter - jump_dest)); } -pub fn eval_ext_circuit_jump_jumpi, const D: usize>( +/// Circuit version of `eval_packed_jumpi_jumpi`. +/// Evaluates constraints jump operations: JUMP and JUMPI. 
+pub(crate) fn eval_ext_circuit_jump_jumpi, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, @@ -196,7 +206,7 @@ pub fn eval_ext_circuit_jump_jumpi, const D: usize> { let constr = builder.arithmetic_extension( F::ONE, - -F::from_canonical_u64(Segment::Stack as u64), + -F::from_canonical_usize(Segment::Stack.unscale()), new_filter, channel.addr_segment, new_filter, @@ -299,7 +309,7 @@ pub fn eval_ext_circuit_jump_jumpi, const D: usize> { let constr = builder.arithmetic_extension( F::ONE, - -F::from_canonical_u64(Segment::JumpdestBits as u64), + -F::from_canonical_usize(Segment::JumpdestBits.unscale()), filter, jumpdest_flag_channel.addr_segment, filter, @@ -317,6 +327,10 @@ pub fn eval_ext_circuit_jump_jumpi, const D: usize> let constr = builder.mul_extension(filter, channel.used); yield_constr.constraint(builder, constr); } + { + let constr = builder.mul_extension(filter, lv.partial_channel.used); + yield_constr.constraint(builder, constr); + } // Channel 1 is unused by the `JUMP` instruction. { let constr = builder.mul_extension(is_jump, lv.mem_channels[1].used); @@ -353,7 +367,8 @@ pub fn eval_ext_circuit_jump_jumpi, const D: usize> } } -pub fn eval_packed( +/// Evaluates constraints for EXIT_KERNEL, JUMP and JUMPI. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -362,7 +377,9 @@ pub fn eval_packed( eval_packed_jump_jumpi(lv, nv, yield_constr); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints for EXIT_KERNEL, JUMP and JUMPI. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, diff --git a/evm/src/cpu/kernel/aggregator.rs b/evm/src/cpu/kernel/aggregator.rs index bda2ab610e..6376552550 100644 --- a/evm/src/cpu/kernel/aggregator.rs +++ b/evm/src/cpu/kernel/aggregator.rs @@ -43,6 +43,7 @@ pub(crate) fn combined_kernel() -> Kernel { include_str!("asm/core/log.asm"), include_str!("asm/core/selfdestruct_list.asm"), include_str!("asm/core/touched_addresses.asm"), + include_str!("asm/core/withdrawals.asm"), include_str!("asm/core/precompiles/main.asm"), include_str!("asm/core/precompiles/ecrec.asm"), include_str!("asm/core/precompiles/sha256.asm"), @@ -121,8 +122,6 @@ pub(crate) fn combined_kernel() -> Kernel { include_str!("asm/mpt/insert/insert_extension.asm"), include_str!("asm/mpt/insert/insert_leaf.asm"), include_str!("asm/mpt/insert/insert_trie_specific.asm"), - include_str!("asm/mpt/load/load.asm"), - include_str!("asm/mpt/load/load_trie_specific.asm"), include_str!("asm/mpt/read.asm"), include_str!("asm/mpt/storage/storage_read.asm"), include_str!("asm/mpt/storage/storage_write.asm"), diff --git a/evm/src/cpu/kernel/asm/account_code.asm b/evm/src/cpu/kernel/asm/account_code.asm index ee19819837..2654bedc7b 100644 --- a/evm/src/cpu/kernel/asm/account_code.asm +++ b/evm/src/cpu/kernel/asm/account_code.asm @@ -2,13 +2,13 @@ global sys_extcodehash: // stack: kexit_info, address SWAP1 %u256_to_addr // stack: address, kexit_info - DUP1 %insert_accessed_addresses - // stack: cold_access, address, kexit_info + SWAP1 + DUP2 %insert_accessed_addresses + // stack: cold_access, kexit_info, address PUSH @GAS_COLDACCOUNTACCESS_MINUS_WARMACCESS MUL PUSH @GAS_WARMACCESS 
ADD - %stack (gas, address, kexit_info) -> (gas, kexit_info, address) %charge_gas // stack: kexit_info, address @@ -48,8 +48,8 @@ retzero: %endmacro %macro extcodesize - %stack (address) -> (address, 0, @SEGMENT_KERNEL_ACCOUNT_CODE, %%after) - %jump(load_code) + %stack (address) -> (address, %%after) + %jump(extcodesize) %%after: %endmacro @@ -57,13 +57,13 @@ global sys_extcodesize: // stack: kexit_info, address SWAP1 %u256_to_addr // stack: address, kexit_info - DUP1 %insert_accessed_addresses - // stack: cold_access, address, kexit_info + SWAP1 + DUP2 %insert_accessed_addresses + // stack: cold_access, kexit_info, address PUSH @GAS_COLDACCOUNTACCESS_MINUS_WARMACCESS MUL PUSH @GAS_WARMACCESS ADD - %stack (gas, address, kexit_info) -> (gas, kexit_info, address) %charge_gas // stack: kexit_info, address @@ -76,157 +76,61 @@ global sys_extcodesize: global extcodesize: // stack: address, retdest - %extcodesize - // stack: extcodesize(address), retdest - SWAP1 JUMP - -%macro extcodecopy - // stack: address, dest_offset, offset, size - %stack (address, dest_offset, offset, size) -> (address, dest_offset, offset, size, %%after) - %jump(extcodecopy) -%%after: -%endmacro - -// Pre stack: kexit_info, address, dest_offset, offset, size -// Post stack: (empty) -global sys_extcodecopy: - %stack (kexit_info, address, dest_offset, offset, size) - -> (address, dest_offset, offset, size, kexit_info) - %u256_to_addr DUP1 %insert_accessed_addresses - // stack: cold_access, address, dest_offset, offset, size, kexit_info - PUSH @GAS_COLDACCOUNTACCESS_MINUS_WARMACCESS - MUL - PUSH @GAS_WARMACCESS - ADD - // stack: Gaccess, address, dest_offset, offset, size, kexit_info - - DUP5 - // stack: size, Gaccess, address, dest_offset, offset, size, kexit_info - ISZERO %jumpi(sys_extcodecopy_empty) - - // stack: Gaccess, address, dest_offset, offset, size, kexit_info - DUP5 %num_bytes_to_num_words %mul_const(@GAS_COPY) ADD - %stack (gas, address, dest_offset, offset, size, kexit_info) -> (gas, 
kexit_info, address, dest_offset, offset, size) - %charge_gas - - %stack (kexit_info, address, dest_offset, offset, size) -> (dest_offset, size, kexit_info, address, dest_offset, offset, size) - %add_or_fault - // stack: expanded_num_bytes, kexit_info, address, dest_offset, offset, size - DUP1 %ensure_reasonable_offset - %update_mem_bytes - - %stack (kexit_info, address, dest_offset, offset, size) -> (address, dest_offset, offset, size, kexit_info) - %extcodecopy - // stack: kexit_info - EXIT_KERNEL - -sys_extcodecopy_empty: - %stack (Gaccess, address, dest_offset, offset, size, kexit_info) -> (Gaccess, kexit_info) - %charge_gas - EXIT_KERNEL - - -// Pre stack: address, dest_offset, offset, size, retdest -// Post stack: (empty) -global extcodecopy: - // stack: address, dest_offset, offset, size, retdest - %stack (address, dest_offset, offset, size, retdest) - -> (address, 0, @SEGMENT_KERNEL_ACCOUNT_CODE, extcodecopy_contd, size, offset, dest_offset, retdest) + %next_context_id + // stack: codesize_ctx, address, retdest + SWAP1 + // stack: address, codesize_ctx, retdest %jump(load_code) -extcodecopy_contd: - // stack: code_size, size, offset, dest_offset, retdest - DUP1 DUP4 - // stack: offset, code_size, code_size, size, offset, dest_offset, retdest - GT %jumpi(extcodecopy_large_offset) - - // stack: code_size, size, offset, dest_offset, retdest - DUP3 DUP3 ADD - // stack: offset + size, code_size, size, offset, dest_offset, retdest - DUP2 GT %jumpi(extcodecopy_within_bounds) - - // stack: code_size, size, offset, dest_offset, retdest - DUP3 DUP3 ADD - // stack: offset + size, code_size, size, offset, dest_offset, retdest - SUB - // stack: extra_size = offset + size - code_size, size, offset, dest_offset, retdest - DUP1 DUP3 SUB - // stack: copy_size = size - extra_size, extra_size, size, offset, dest_offset, retdest - - // Compute the new dest_offset after actual copies, at which we will start padding with zeroes. 
- DUP1 DUP6 ADD - // stack: new_dest_offset, copy_size, extra_size, size, offset, dest_offset, retdest - - GET_CONTEXT - %stack (context, new_dest_offset, copy_size, extra_size, size, offset, dest_offset, retdest) -> - (context, @SEGMENT_MAIN_MEMORY, dest_offset, 0, @SEGMENT_KERNEL_ACCOUNT_CODE, offset, copy_size, extcodecopy_end, new_dest_offset, extra_size, retdest) - %jump(memcpy_bytes) - -extcodecopy_within_bounds: - // stack: code_size, size, offset, dest_offset, retdest - GET_CONTEXT - %stack (context, code_size, size, offset, dest_offset, retdest) -> - (context, @SEGMENT_MAIN_MEMORY, dest_offset, 0, @SEGMENT_KERNEL_ACCOUNT_CODE, offset, size, retdest) - %jump(memcpy_bytes) - -// Same as extcodecopy_large_offset, but without `offset` in the stack. -extcodecopy_end: - // stack: dest_offset, size, retdest - GET_CONTEXT - %stack (context, dest_offset, size, retdest) -> - (context, @SEGMENT_MAIN_MEMORY, dest_offset, size, retdest) - %jump(memset) - -extcodecopy_large_offset: - // offset is larger than the code size. So we just have to write zeros. - // stack: code_size, size, offset, dest_offset, retdest - GET_CONTEXT - %stack (context, code_size, size, offset, dest_offset, retdest) -> (context, @SEGMENT_MAIN_MEMORY, dest_offset, size, retdest) - %jump(memset) - -// Loads the code at `address` into memory, at the given context and segment, starting at offset 0. +// Loads the code at `address` into memory, in the code segment of the given context, starting at offset 0. // Checks that the hash of the loaded code corresponds to the `codehash` in the state trie. -// Pre stack: address, ctx, segment, retdest +// Pre stack: address, ctx, retdest // Post stack: code_size +// +// NOTE: The provided `dest` **MUST** have a virtual address of 0. 
global load_code: - %stack (address, ctx, segment, retdest) -> (extcodehash, address, load_code_ctd, ctx, segment, retdest) + %stack (address, ctx, retdest) -> (extcodehash, address, load_code_ctd, ctx, retdest) JUMP load_code_ctd: - // stack: codehash, ctx, segment, retdest + // stack: codehash, ctx, retdest DUP1 ISZERO %jumpi(load_code_non_existent_account) - PROVER_INPUT(account_code::length) - // stack: code_size, codehash, ctx, segment, retdest - PUSH 0 - -// Loop non-deterministically querying `code[i]` and storing it in `SEGMENT_KERNEL_ACCOUNT_CODE` -// at offset `i`, until `i==code_size`. -load_code_loop: - // stack: i, code_size, codehash, ctx, segment, retdest - DUP2 DUP2 EQ - // stack: i == code_size, i, code_size, codehash, ctx, segment, retdest - %jumpi(load_code_check) - PROVER_INPUT(account_code::get) - // stack: opcode, i, code_size, codehash, ctx, segment, retdest - DUP2 - // stack: i, opcode, i, code_size, codehash, ctx, segment, retdest - DUP7 // segment - DUP7 // context - MSTORE_GENERAL - // stack: i, code_size, codehash, ctx, segment, retdest - %increment - // stack: i+1, code_size, codehash, ctx, segment, retdest - %jump(load_code_loop) - -// Check that the hash of the loaded code equals `codehash`. -load_code_check: - // stack: i, code_size, codehash, ctx, segment, retdest - %stack (i, code_size, codehash, ctx, segment, retdest) - -> (ctx, segment, 0, code_size, codehash, retdest, code_size) + // Load the code non-deterministically in memory and return the length. + PROVER_INPUT(account_code) + %stack (code_size, codehash, ctx, retdest) -> (ctx, code_size, codehash, retdest, code_size) + // Check that the hash of the loaded code equals `codehash`. + // ctx == DST, as SEGMENT_CODE == offset == 0. 
KECCAK_GENERAL // stack: shouldbecodehash, codehash, retdest, code_size %assert_eq + // stack: retdest, code_size JUMP load_code_non_existent_account: - %stack (codehash, ctx, segment, retdest) -> (retdest, 0) + // Write 0 at address 0 for soundness: SEGMENT_CODE == 0, hence ctx == addr. + // stack: codehash, addr, retdest + %stack (codehash, addr, retdest) -> (0, addr, retdest, 0) + MSTORE_GENERAL + // stack: retdest, 0 + JUMP + +// Identical to load_code, but adds 33 zeros after code_size for soundness reasons. +// If the code ends with an incomplete PUSH, we must make sure that every subsequent read is 0, +// accordingly to the Ethereum specs. +// Pre stack: address, ctx, retdest +// Post stack: code_size +global load_code_padded: + %stack (address, ctx, retdest) -> (address, ctx, load_code_padded_ctd, ctx, retdest) + %jump(load_code) + +load_code_padded_ctd: + // SEGMENT_CODE == 0. + // stack: code_size, ctx, retdest + %stack (code_size, ctx, retdest) -> (ctx, code_size, 0, retdest, code_size) + ADD + // stack: addr, 0, retdest, code_size + MSTORE_32BYTES_32 + // stack: addr', retdest, code_size + PUSH 0 + MSTORE_GENERAL + // stack: retdest, code_size JUMP diff --git a/evm/src/cpu/kernel/asm/balance.asm b/evm/src/cpu/kernel/asm/balance.asm index f175d027c9..d39f660630 100644 --- a/evm/src/cpu/kernel/asm/balance.asm +++ b/evm/src/cpu/kernel/asm/balance.asm @@ -2,13 +2,13 @@ global sys_balance: // stack: kexit_info, address SWAP1 %u256_to_addr // stack: address, kexit_info - DUP1 %insert_accessed_addresses - // stack: cold_access, address, kexit_info + SWAP1 + DUP2 %insert_accessed_addresses + // stack: cold_access, kexit_info, address PUSH @GAS_COLDACCOUNTACCESS_MINUS_WARMACCESS MUL PUSH @GAS_WARMACCESS ADD - %stack (gas, address, kexit_info) -> (gas, kexit_info, address) %charge_gas // stack: kexit_info, address diff --git a/evm/src/cpu/kernel/asm/bignum/add.asm b/evm/src/cpu/kernel/asm/bignum/add.asm index c9070dd107..4433ab2245 100644 --- 
a/evm/src/cpu/kernel/asm/bignum/add.asm +++ b/evm/src/cpu/kernel/asm/bignum/add.asm @@ -9,49 +9,55 @@ global add_bignum: ISZERO %jumpi(len_zero) // stack: len, a_start_loc, b_start_loc, retdest + %build_current_general_address_no_offset PUSH 0 - // stack: carry=0, i=len, a_cur_loc=a_start_loc, b_cur_loc=b_start_loc, retdest + // stack: carry=0, base_addr, i=len, a_cur_loc=a_start_loc, b_cur_loc=b_start_loc, retdest add_loop: - // stack: carry, i, a_cur_loc, b_cur_loc, retdest - DUP4 - %mload_current_general - // stack: b[cur], carry, i, a_cur_loc, b_cur_loc, retdest - DUP4 - %mload_current_general - // stack: a[cur], b[cur], carry, i, a_cur_loc, b_cur_loc, retdest + // stack: carry, base_addr, i, a_cur_loc, b_cur_loc, retdest + DUP2 + // stack: base_addr, carry, base_addr, i, a_cur_loc, b_cur_loc, retdest + DUP6 ADD // base_addr + b_cur_loc + MLOAD_GENERAL + // stack: b[cur], carry, base_addr, i, a_cur_loc, b_cur_loc, retdest + DUP3 + DUP6 ADD // base_addr + a_cur_loc + MLOAD_GENERAL + // stack: a[cur], b[cur], carry, base_addr, i, a_cur_loc, b_cur_loc, retdest ADD ADD - // stack: a[cur] + b[cur] + carry, i, a_cur_loc, b_cur_loc, retdest + // stack: a[cur] + b[cur] + carry, base_addr, i, a_cur_loc, b_cur_loc, retdest DUP1 - // stack: a[cur] + b[cur] + carry, a[cur] + b[cur] + carry, i, a_cur_loc, b_cur_loc, retdest + // stack: a[cur] + b[cur] + carry, a[cur] + b[cur] + carry, base_addr, i, a_cur_loc, b_cur_loc, retdest %shr_const(128) - // stack: (a[cur] + b[cur] + carry) // 2^128, a[cur] + b[cur] + carry, i, a_cur_loc, b_cur_loc, retdest + // stack: (a[cur] + b[cur] + carry) // 2^128, a[cur] + b[cur] + carry, base_addr, i, a_cur_loc, b_cur_loc, retdest SWAP1 - // stack: a[cur] + b[cur] + carry, (a[cur] + b[cur] + carry) // 2^128, i, a_cur_loc, b_cur_loc, retdest + // stack: a[cur] + b[cur] + carry, (a[cur] + b[cur] + carry) // 2^128, base_addr, i, a_cur_loc, b_cur_loc, retdest %mod_const(0x100000000000000000000000000000000) - // stack: c[cur] = (a[cur] + b[cur] + 
carry) % 2^128, carry_new = (a[cur] + b[cur] + carry) // 2^128, i, a_cur_loc, b_cur_loc, retdest - DUP4 - // stack: a_cur_loc, c[cur], carry_new, i, a_cur_loc, b_cur_loc, retdest - %mstore_current_general - // stack: carry_new, i, a_cur_loc, b_cur_loc, retdest - SWAP2 - %increment - SWAP2 - // stack: carry_new, i, a_cur_loc + 1, b_cur_loc, retdest + // stack: c[cur] = (a[cur] + b[cur] + carry) % 2^128, carry_new = (a[cur] + b[cur] + carry) // 2^128, base_addr, i, a_cur_loc, b_cur_loc, retdest + DUP3 + DUP6 + ADD // base_addr + a_cur_loc + // stack: a_cur_addr, c[cur], carry_new, base_addr, i, a_cur_loc, b_cur_loc, retdest + %swap_mstore + // stack: carry_new, base_addr, i, a_cur_loc, b_cur_loc, retdest SWAP3 %increment SWAP3 - // stack: carry_new, i, a_cur_loc + 1, b_cur_loc + 1, retdest - SWAP1 + // stack: carry_new, base_addr, i, a_cur_loc + 1, b_cur_loc, retdest + SWAP4 + %increment + SWAP4 + // stack: carry_new, base_addr, i, a_cur_loc + 1, b_cur_loc + 1, retdest + SWAP2 %decrement - SWAP1 - // stack: carry_new, i - 1, a_cur_loc + 1, b_cur_loc + 1, retdest - DUP2 - // stack: i - 1, carry_new, i - 1, a_cur_loc + 1, b_cur_loc + 1, retdest + SWAP2 + // stack: carry_new, base_addr, i - 1, a_cur_loc + 1, b_cur_loc + 1, retdest + DUP3 + // stack: i - 1, carry_new, base_addr, i - 1, a_cur_loc + 1, b_cur_loc + 1, retdest %jumpi(add_loop) add_end: - // stack: carry_new, i - 1, a_cur_loc + 1, b_cur_loc + 1, retdest - %stack (c, i, a, b) -> (c) + // stack: carry_new, base_addr, i - 1, a_cur_loc + 1, b_cur_loc + 1, retdest + %stack (c, addr, i, a, b) -> (c) // stack: carry_new, retdest SWAP1 // stack: retdest, carry_new diff --git a/evm/src/cpu/kernel/asm/bignum/addmul.asm b/evm/src/cpu/kernel/asm/bignum/addmul.asm index 13e59e6d80..9cdf904e1f 100644 --- a/evm/src/cpu/kernel/asm/bignum/addmul.asm +++ b/evm/src/cpu/kernel/asm/bignum/addmul.asm @@ -8,95 +8,99 @@ global addmul_bignum: // stack: len, len, a_start_loc, b_start_loc, val, retdest ISZERO %jumpi(len_zero) + 
%build_current_general_address_no_offset PUSH 0 - // stack: carry_limb=0, i=len, a_cur_loc=a_start_loc, b_cur_loc=b_start_loc, val, retdest + // stack: carry_limb=0, base_addr, i=len, a_cur_loc=a_start_loc, b_cur_loc=b_start_loc, val, retdest addmul_loop: - // stack: carry_limb, i, a_cur_loc, b_cur_loc, val, retdest - DUP4 - // stack: b_cur_loc, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest - %mload_current_general - // stack: b[cur], carry_limb, i, a_cur_loc, b_cur_loc, val, retdest - DUP6 - // stack: val, b[cur], carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest + DUP2 + DUP6 ADD // base_addr + b_cur_loc + // stack: b_cur_addr, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest + MLOAD_GENERAL + // stack: b[cur], carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest + DUP7 + // stack: val, b[cur], carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest MUL - // stack: val * b[cur], carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: val * b[cur], carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest DUP1 - // stack: val * b[cur], val * b[cur], carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: val * b[cur], val * b[cur], carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest %shr_const(128) - // stack: (val * b[cur]) // 2^128, val * b[cur], carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: (val * b[cur]) // 2^128, val * b[cur], carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest SWAP1 - // stack: val * b[cur], (val * b[cur]) // 2^128, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: val * b[cur], (val * b[cur]) // 2^128, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest %shl_const(128) %shr_const(128) - // stack: prod_lo = val * b[cur] % 2^128, prod_hi = (val * b[cur]) // 2^128, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest - DUP5 - // stack: a_cur_loc, prod_lo, prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest - 
%mload_current_general - // stack: a[cur], prod_lo, prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_lo = val * b[cur] % 2^128, prod_hi = (val * b[cur]) // 2^128, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest + DUP4 + DUP7 ADD // base_addr + a_cur_loc + // stack: a_cur_addr, prod_lo, prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest + MLOAD_GENERAL + // stack: a[cur], prod_lo, prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest DUP1 - // stack: a[cur], a[cur], prod_lo, prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: a[cur], a[cur], prod_lo, prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest SWAP2 - // stack: prod_lo, a[cur], a[cur], prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_lo, a[cur], a[cur], prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest ADD %shl_const(128) %shr_const(128) - // stack: prod_lo' = (prod_lo + a[cur]) % 2^128, a[cur], prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_lo' = (prod_lo + a[cur]) % 2^128, a[cur], prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest DUP1 - // stack: prod_lo', prod_lo', a[cur], prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_lo', prod_lo', a[cur], prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest SWAP2 - // stack: a[cur], prod_lo', prod_lo', prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: a[cur], prod_lo', prod_lo', prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest GT - // stack: prod_lo_carry_limb = a[cur] > prod_lo', prod_lo', prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_lo_carry_limb = a[cur] > prod_lo', prod_lo', prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest SWAP1 - // stack: prod_lo', prod_lo_carry_limb, prod_hi, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_lo', 
prod_lo_carry_limb, prod_hi, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest SWAP2 - // stack: prod_hi, prod_lo_carry_limb, prod_lo', carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_hi, prod_lo_carry_limb, prod_lo', carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest ADD - // stack: prod_hi' = prod_hi + prod_lo_carry_limb, prod_lo', carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_hi' = prod_hi + prod_lo_carry_limb, prod_lo', carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest DUP3 - // stack: carry_limb, prod_hi', prod_lo', carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: carry_limb, prod_hi', prod_lo', carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest DUP3 - // stack: prod_lo', carry_limb, prod_hi', prod_lo', carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_lo', carry_limb, prod_hi', prod_lo', carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest ADD %shl_const(128) %shr_const(128) - // stack: to_write = (prod_lo' + carry_limb) % 2^128, prod_hi', prod_lo', carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: to_write = (prod_lo' + carry_limb) % 2^128, prod_hi', prod_lo', carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest SWAP2 - // stack: prod_lo', prod_hi', to_write, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: prod_lo', prod_hi', to_write, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest DUP3 - // stack: to_write, prod_lo', prod_hi', to_write, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest + // stack: to_write, prod_lo', prod_hi', to_write, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest LT // stack: carry_limb_new = to_write < prod_lo', prod_hi', to_write, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest %stack (vals: 3, c) -> (vals) - // stack: carry_limb_new, prod_hi', to_write, i, a_cur_loc, b_cur_loc, val, retdest + // stack: carry_limb_new, prod_hi', to_write, addr, i, a_cur_loc, b_cur_loc, val, retdest ADD - // 
stack: carry_limb = carry_limb_new' + prod_hi', to_write, i, a_cur_loc, b_cur_loc, val, retdest + // stack: carry_limb = carry_limb_new' + prod_hi', to_write, addr, i, a_cur_loc, b_cur_loc, val, retdest SWAP1 - // stack: to_write, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest - DUP4 - // stack: a_cur_loc, to_write, carry_limb, i, a_cur_loc, b_cur_loc, val, retdest - %mstore_current_general - // stack: carry_limb, i, a_cur_loc, b_cur_loc, val, retdest - SWAP1 - // stack: i, carry_limb, a_cur_loc, b_cur_loc, val, retdest - %decrement - // stack: i-1, carry_limb, a_cur_loc, b_cur_loc, val, retdest + // stack: to_write, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest + DUP3 + DUP6 ADD // base_addr + a_cur_loc + // stack: a_cur_addr, to_write, carry_limb, addr, i, a_cur_loc, b_cur_loc, val, retdest + %swap_mstore + // stack: carry_limb, base_addr, i, a_cur_loc, b_cur_loc, val, retdest SWAP2 - // stack: a_cur_loc, carry_limb, i-1, b_cur_loc, val, retdest - %increment - // stack: a_cur_loc+1, carry_limb, i-1, b_cur_loc, val, retdest + // stack: i, base_addr, carry_limb, a_cur_loc, b_cur_loc, val, retdest + %decrement + // stack: i-1, base_addr, carry_limb, a_cur_loc, b_cur_loc, val, retdest SWAP3 - // stack: b_cur_loc, carry_limb, i-1, a_cur_loc+1, val, retdest + // stack: a_cur_loc, base_addr, carry_limb, i-1, b_cur_loc, val, retdest %increment - // stack: b_cur_loc+1, carry_limb, i-1, a_cur_loc+1, val, retdest - %stack (b, c, i, a) -> (c, i, a, b) - // stack: carry_limb, i-1, a_cur_loc+1, b_cur_loc+1, val, retdest - DUP2 - // stack: i-1, carry_limb, i-1, a_cur_loc+1, b_cur_loc+1, val, retdest + // stack: a_cur_loc+1, base_addr, carry_limb, i-1, b_cur_loc, val, retdest + SWAP4 + // stack: b_cur_loc, base_addr, carry_limb, i-1, a_cur_loc+1, val, retdest + %increment + // stack: b_cur_loc+1, base_addr, carry_limb, i-1, a_cur_loc+1, val, retdest + %stack (b, addr, c, i, a) -> (c, addr, i, a, b) + // stack: carry_limb, base_addr, i-1, a_cur_loc+1, b_cur_loc+1, 
val, retdest + DUP3 + // stack: i-1, carry_limb, base_addr, i-1, a_cur_loc+1, b_cur_loc+1, val, retdest %jumpi(addmul_loop) addmul_end: - // stack: carry_limb_new, i-1, a_cur_loc+1, b_cur_loc+1, val, retdest - %stack (c, i, a, b, v) -> (c) + // stack: carry_limb_new, base_addr, i-1, a_cur_loc+1, b_cur_loc+1, val, retdest + %stack (c, addr, i, a, b, v) -> (c) // stack: carry_limb_new, retdest SWAP1 // stack: retdest, carry_limb_new diff --git a/evm/src/cpu/kernel/asm/bignum/cmp.asm b/evm/src/cpu/kernel/asm/bignum/cmp.asm index d5abd238fe..c27687542e 100644 --- a/evm/src/cpu/kernel/asm/bignum/cmp.asm +++ b/evm/src/cpu/kernel/asm/bignum/cmp.asm @@ -5,80 +5,87 @@ // Returns 1 if a > b, 0 if a == b, and -1 (that is, 2^256 - 1) if a < b. global cmp_bignum: // stack: len, a_start_loc, b_start_loc, retdest - DUP1 - // stack: len, len, a_start_loc, b_start_loc, retdest - ISZERO - %jumpi(equal) - // stack: len, a_start_loc, b_start_loc, retdest - SWAP1 - // stack: a_start_loc, len, b_start_loc, retdest + %build_current_general_address_no_offset + // stack: base_addr, len, a_start_loc, b_start_loc, retdest DUP2 - // stack: len, a_start_loc, len, b_start_loc, retdest - ADD - %decrement - // stack: a_end_loc, len, b_start_loc, retdest + // stack: len, base_addr, len, a_start_loc, b_start_loc, retdest + ISZERO + %jumpi(equal) // len and base_addr are swapped, but they will be popped anyway + // stack: base_addr, len, a_start_loc, b_start_loc, retdest SWAP2 - // stack: b_start_loc, len, a_end_loc, retdest - DUP2 - // stack: len, b_start_loc, len, a_end_loc, retdest + // stack: a_start_loc, len, base_addr, b_start_loc, retdest + PUSH 1 + DUP3 + SUB + // stack: len-1, a_start_loc, len, base_addr, b_start_loc, retdest ADD - %decrement - // stack: b_end_loc, len, a_end_loc, retdest - %stack (b, l, a) -> (l, a, b) - // stack: len, a_end_loc, b_end_loc, retdest + // stack: a_end_loc, len, base_addr, b_start_loc, retdest + SWAP3 + // stack: b_start_loc, len, base_addr, a_end_loc, 
retdest + PUSH 1 + DUP3 + SUB + // stack: len-1, b_start_loc, len, base_addr, a_end_loc, retdest + ADD + // stack: b_end_loc, len, base_addr, a_end_loc, retdest + + %stack (b, l, addr, a) -> (l, addr, a, b) + // stack: len, base_addr, a_end_loc, b_end_loc, retdest %decrement ge_loop: - // stack: i, a_i_loc, b_i_loc, retdest - DUP3 - DUP3 - // stack: a_i_loc, b_i_loc, i, a_i_loc, b_i_loc, retdest - %mload_current_general - SWAP1 - %mload_current_general - SWAP1 - // stack: a[i], b[i], i, a_i_loc, b_i_loc, retdest + // stack: i, base_addr, a_i_loc, b_i_loc, retdest + DUP4 + // stack: b_i_loc, i, base_addr, a_i_loc, b_i_loc, retdest + DUP3 ADD // b_i_addr + MLOAD_GENERAL + // stack: b[i], i, base_addr, a_i_loc, b_i_loc, retdest + DUP4 + // stack: a_i_loc, b[i], i, base_addr, a_i_loc, b_i_loc, retdest + DUP4 ADD // a_i_addr + MLOAD_GENERAL + // stack: a[i], b[i], i, base_addr, a_i_loc, b_i_loc, retdest %stack (vals: 2) -> (vals, vals) GT %jumpi(greater) - // stack: a[i], b[i], i, a_i_loc, b_i_loc, retdest + // stack: a[i], b[i], i, base_addr, a_i_loc, b_i_loc, retdest LT %jumpi(less) - // stack: i, a_i_loc, b_i_loc, retdest + // stack: i, base_addr, a_i_loc, b_i_loc, retdest DUP1 ISZERO %jumpi(equal) %decrement - // stack: i-1, a_i_loc, b_i_loc, retdest - SWAP1 - // stack: a_i_loc, i-1, b_i_loc, retdest - %decrement - // stack: a_i_loc_new, i-1, b_i_loc, retdest + // stack: i-1, base_addr, a_i_loc, b_i_loc, retdest SWAP2 - // stack: b_i_loc, i-1, a_i_loc_new, retdest + // stack: a_i_loc, base_addr, i-1, b_i_loc, retdest + %decrement + // stack: a_i_loc_new, base_addr, i-1, b_i_loc, retdest + SWAP3 + // stack: b_i_loc, base_addr, i-1, a_i_loc_new, retdest %decrement - // stack: b_i_loc_new, i-1, a_i_loc_new, retdest - %stack (b, i, a) -> (i, a, b) - // stack: i-1, a_i_loc_new, b_i_loc_new, retdest + // stack: b_i_loc_new, base_addr, i-1, a_i_loc_new, retdest + %stack (b, addr, i, a) -> (i, addr, a, b) + // stack: i-1, base_addr, a_i_loc_new, b_i_loc_new, retdest 
%jump(ge_loop) equal: - // stack: i, a_i_loc, b_i_loc, retdest - %pop3 + // stack: i, base_addr, a_i_loc, b_i_loc, retdest + %pop4 // stack: retdest PUSH 0 // stack: 0, retdest SWAP1 JUMP greater: - // stack: a[i], b[i], i, a_i_loc, b_i_loc, retdest - %pop5 + // stack: a[i], b[i], i, base_addr, a_i_loc, b_i_loc, retdest + %pop6 // stack: retdest PUSH 1 // stack: 1, retdest SWAP1 JUMP less: - // stack: i, a_i_loc, b_i_loc, retdest - %pop3 + // stack: i, base_addr, a_i_loc, b_i_loc, retdest + %pop4 // stack: retdest PUSH 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff // stack: -1, retdest diff --git a/evm/src/cpu/kernel/asm/bignum/modmul.asm b/evm/src/cpu/kernel/asm/bignum/modmul.asm index 8b19d3e102..9735f6108d 100644 --- a/evm/src/cpu/kernel/asm/bignum/modmul.asm +++ b/evm/src/cpu/kernel/asm/bignum/modmul.asm @@ -21,28 +21,32 @@ global modmul_bignum: // STEP 1: // The prover provides x := (a * b) % m, which we store in output_loc. + %build_current_general_address_no_offset + PUSH 0 - // stack: i=0, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: i=0, base_addr, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest modmul_remainder_loop: - // stack: i, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: i, base_addr, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest PROVER_INPUT(bignum_modmul) - // stack: PI, i, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest - DUP7 + // stack: PI, i, base_addr, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + DUP8 DUP3 ADD - // stack: out_loc[i], PI, i, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest - %mstore_current_general - // stack: i, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: out_loc[i], PI, i, base_addr, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + DUP4 ADD // out_addr_i + %swap_mstore + // stack: i, base_addr, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest %increment + DUP3 DUP2 - DUP2 - // stack: 
i+1, len, i+1, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: i+1, len, i+1, base_addr, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest SUB // functions as NEQ - // stack: i+1!=len, i+1, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: i+1!=len, i+1, base_addr, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest %jumpi(modmul_remainder_loop) // end of modmul_remainder_loop - // stack: i, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest - POP + // stack: i, base_addr, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + %pop2 + // stack: len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest // stack: len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest @@ -69,28 +73,32 @@ modmul_return_1: // stack: len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest %mul_const(2) // stack: 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + + %build_current_general_address_no_offset + PUSH 0 - // stack: i=0, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: i=0, base_addr, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest modmul_quotient_loop: - // stack: i, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: i, base_addr, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest PROVER_INPUT(bignum_modmul) - // stack: PI, i, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest - DUP9 + // stack: PI, i, base_addr, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + DUP10 DUP3 ADD - // stack: s1[i], PI, i, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest - %mstore_current_general - // stack: i, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: s1[i], PI, i, base_addr, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + DUP4 ADD // s1_addr_i + %swap_mstore + // stack: i, base_addr, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest %increment + DUP3 DUP2 - DUP2 - 
// stack: i+1, 2*len, i+1, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: i+1, 2*len, i+1, base_addr, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest SUB // functions as NEQ - // stack: i+1!=2*len, i+1, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + // stack: i+1!=2*len, i+1, base_addr, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest %jumpi(modmul_quotient_loop) // end of modmul_quotient_loop - // stack: i, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest - %pop2 + // stack: i, base_addr, 2*len, len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest + %pop3 // stack: len, a_loc, b_loc, m_loc, out_loc, s1, s2, s3, retdest // STEP 4: @@ -130,33 +138,36 @@ modmul_return_4: // STEP 6: // Check that x + k * m = a * b. - // Walk through scratch_2 and scratch_3, checking that they are equal. - // stack: n=len, i=s2, j=s3, retdest + %build_current_general_address_no_offset + // stack: base_addr, n=len, i=s2, j=s3, retdest modmul_check_loop: - // stack: n, i, j, retdest - %stack (l, idx: 2) -> (idx, l, idx) - // stack: i, j, n, i, j, retdest - %mload_current_general - SWAP1 - %mload_current_general - SWAP1 - // stack: mem[i], mem[j], n, i, j, retdest + // stack: base_addr, n, i, j, retdest + %stack (addr, l, i, j) -> (j, i, addr, addr, l, i, j) + // stack: j, i, base_addr, base_addr, n, i, j, retdest + DUP3 ADD // addr_j + MLOAD_GENERAL + // stack: mem[j], i, base_addr, base_addr, n, i, j, retdest + SWAP2 + ADD // addr_i + MLOAD_GENERAL + // stack: mem[i], mem[j], base_addr, n, i, j, retdest %assert_eq - // stack: n, i, j, retdest - %decrement + // stack: base_addr, n, i, j, retdest SWAP1 - %increment + %decrement + // stack: n-1, base_addr, i, j, retdest SWAP2 %increment - SWAP2 - SWAP1 - // stack: n-1, i+1, j+1, retdest - DUP1 - // stack: n-1, n-1, i+1, j+1, retdest + // stack: i+1, base_addr, n-1, j, retdest + SWAP3 + %increment + // stack: j+1, base_addr, n-1, i+1, retdest + %stack (j, 
addr, n, i) -> (n, addr, n, i, j) + // stack: n-1, base_addr, n-1, i+1, j+1, retdest %jumpi(modmul_check_loop) // end of modmul_check_loop - // stack: n-1, i+1, j+1, retdest - %pop3 + // stack: base_addr, n-1, i+1, j+1, retdest + %pop4 // stack: retdest JUMP diff --git a/evm/src/cpu/kernel/asm/bignum/mul.asm b/evm/src/cpu/kernel/asm/bignum/mul.asm index ddb3346d6a..b3269f73a9 100644 --- a/evm/src/cpu/kernel/asm/bignum/mul.asm +++ b/evm/src/cpu/kernel/asm/bignum/mul.asm @@ -12,49 +12,54 @@ global mul_bignum: // stack: len, len, a_start_loc, b_start_loc, output_loc, retdest ISZERO %jumpi(len_zero) - DUP1 - // stack: n=len, len, a_start_loc, bi=b_start_loc, output_cur=output_loc, retdest + + %build_current_general_address_no_offset + + DUP2 + // stack: n=len, base_addr, len, a_start_loc, bi=b_start_loc, output_cur=output_loc, retdest mul_loop: - // stack: n, len, a_start_loc, bi, output_cur, retdest + // stack: n, base_addr, len, a_start_loc, bi, output_cur, retdest PUSH mul_addmul_return - // stack: mul_addmul_return, n, len, a_start_loc, bi, output_cur, retdest - DUP5 - // stack: bi, mul_addmul_return, n, len, a_start_loc, bi, output_cur, retdest - %mload_current_general - // stack: b[i], mul_addmul_return, n, len, a_start_loc, bi, output_cur, retdest, b - DUP5 - // stack: a_start_loc, b[i], mul_addmul_return, n, len, a_start_loc, bi, output_cur, retdest, b - DUP8 - // stack: output_loc, a_start_loc, b[i], mul_addmul_return, n, len, a_start_loc, bi, output_cur, retdest, b + // stack: mul_addmul_return, n, base_addr, len, a_start_loc, bi, output_cur, retdest + DUP6 + // stack: bi, mul_addmul_return, n, base_addr, len, a_start_loc, bi, output_cur, retdest + DUP4 ADD // bi_addr + MLOAD_GENERAL + // stack: b[i], mul_addmul_return, n, base_addr, len, a_start_loc, bi, output_cur, retdest DUP6 - // stack: len, output_loc, a_start_loc, b[i], mul_addmul_return, n, len, a_start_loc, bi, output_cur, retdest, b + // stack: a_start_loc, b[i], mul_addmul_return, n, base_addr, 
len, a_start_loc, bi, output_cur, retdest + DUP9 + // stack: output_loc, a_start_loc, b[i], mul_addmul_return, n, base_addr, len, a_start_loc, bi, output_cur, retdest + DUP7 + // stack: len, output_loc, a_start_loc, b[i], mul_addmul_return, n, base_addr, len, a_start_loc, bi, output_cur, retdest %jump(addmul_bignum) mul_addmul_return: - // stack: carry_limb, n, len, a_start_loc, bi, output_cur, retdest - DUP6 - // stack: output_cur, carry_limb, n, len, a_start_loc, bi, output_cur, retdest - DUP4 - // stack: len, output_cur, carry_limb, n, len, a_start_loc, bi, output_cur, retdest + // stack: carry_limb, n, base_addr, len, a_start_loc, bi, output_cur, retdest + DUP7 + // stack: output_cur, carry_limb, n, base_addr, len, a_start_loc, bi, output_cur, retdest + DUP5 + // stack: len, output_cur, carry_limb, n, base_addr, len, a_start_loc, bi, output_cur, retdest ADD - // stack: output_cur + len, carry_limb, n, len, a_start_loc, bi, output_cur, retdest - %mstore_current_general - // stack: n, len, a_start_loc, bi, output_cur, retdest + // stack: output_cur + len, carry_limb, n, base_addr, len, a_start_loc, bi, output_cur, retdest + DUP4 ADD + %swap_mstore + // stack: n, base_addr, len, a_start_loc, bi, output_cur, retdest %decrement - // stack: n-1, len, a_start_loc, bi, output_cur, retdest - SWAP3 - %increment - SWAP3 - // stack: n-1, len, a_start_loc, bi+1, output_cur, retdest + // stack: n-1, base_addr, len, a_start_loc, bi, output_cur, retdest SWAP4 %increment SWAP4 - // stack: n-1, len, a_start_loc, bi+1, output_cur+1, retdest + // stack: n-1, base_addr, len, a_start_loc, bi+1, output_cur, retdest + SWAP5 + %increment + SWAP5 + // stack: n-1, base_addr, len, a_start_loc, bi+1, output_cur+1, retdest DUP1 - // stack: n-1, n-1, len, a_start_loc, bi+1, output_cur+1, retdest + // stack: n-1, n-1, base_addr, len, a_start_loc, bi+1, output_cur+1, retdest %jumpi(mul_loop) mul_end: - // stack: n-1, len, a_start_loc, bi+1, output_cur+1, retdest - %pop5 + // stack: n-1, 
base_addr, len, a_start_loc, bi+1, output_cur+1, retdest + %pop6 // stack: retdest JUMP diff --git a/evm/src/cpu/kernel/asm/bignum/shr.asm b/evm/src/cpu/kernel/asm/bignum/shr.asm index adf9577084..88d08f05f2 100644 --- a/evm/src/cpu/kernel/asm/bignum/shr.asm +++ b/evm/src/cpu/kernel/asm/bignum/shr.asm @@ -16,48 +16,54 @@ global shr_bignum: // stack: start_loc + len, start_loc, retdest %decrement // stack: end_loc, start_loc, retdest - %stack (e) -> (e, 0) - // stack: i=end_loc, carry=0, start_loc, retdest + + %build_current_general_address_no_offset + + // stack: base_addr, end_loc, start_loc, retdest + %stack (addr, e) -> (e, addr, 0) + // stack: i=end_loc, base_addr, carry=0, start_loc, retdest shr_loop: - // stack: i, carry, start_loc, retdest + // stack: i, base_addr, carry, start_loc, retdest DUP1 - // stack: i, i, carry, start_loc, retdest - %mload_current_general - // stack: a[i], i, carry, start_loc, retdest + // stack: i, i, base_addr, carry, start_loc, retdest + DUP3 ADD // addr_i + MLOAD_GENERAL + // stack: a[i], i, base_addr, carry, start_loc, retdest DUP1 - // stack: a[i], a[i], i, carry, start_loc, retdest + // stack: a[i], a[i], i, base_addr, carry, start_loc, retdest %shr_const(1) - // stack: a[i] >> 1, a[i], i, carry, start_loc, retdest + // stack: a[i] >> 1, a[i], i, base_addr, carry, start_loc, retdest SWAP1 - // stack: a[i], a[i] >> 1, i, carry, start_loc, retdest + // stack: a[i], a[i] >> 1, i, base_addr, carry, start_loc, retdest %mod_const(2) - // stack: new_carry = a[i] % 2, a[i] >> 1, i, carry, start_loc, retdest - SWAP3 - // stack: carry, a[i] >> 1, i, new_carry, start_loc, retdest + // stack: new_carry = a[i] % 2, a[i] >> 1, i, base_addr, carry, start_loc, retdest + SWAP4 + // stack: carry, a[i] >> 1, i, base_addr, new_carry, start_loc, retdest %shl_const(127) - // stack: carry << 127, a[i] >> 1, i, new_carry, start_loc, retdest + // stack: carry << 127, a[i] >> 1, i, base_addr, new_carry, start_loc, retdest ADD - // stack: carry << 127 | 
a[i] >> 1, i, new_carry, start_loc, retdest + // stack: carry << 127 | a[i] >> 1, i, base_addr, new_carry, start_loc, retdest DUP2 - // stack: i, carry << 127 | a[i] >> 1, i, new_carry, start_loc, retdest - %mstore_current_general - // stack: i, new_carry, start_loc, retdest - DUP1 - // stack: i, i, new_carry, start_loc, retdest - %decrement - // stack: i-1, i, new_carry, start_loc, retdest + // stack: i, carry << 127 | a[i] >> 1, i, base_addr, new_carry, start_loc, retdest + DUP4 ADD // addr_i + %swap_mstore + // stack: i, base_addr, new_carry, start_loc, retdest + PUSH 1 + DUP2 + SUB + // stack: i-1, i, base_addr, new_carry, start_loc, retdest SWAP1 - // stack: i, i-1, new_carry, start_loc, retdest - DUP4 - // stack: start_loc, i, i-1, new_carry, start_loc, retdest + // stack: i, i-1, base_addr, new_carry, start_loc, retdest + DUP5 + // stack: start_loc, i, i-1, base_addr, new_carry, start_loc, retdest EQ - // stack: i == start_loc, i-1, new_carry, start_loc, retdest + // stack: i == start_loc, i-1, base_addr, new_carry, start_loc, retdest ISZERO - // stack: i != start_loc, i-1, new_carry, start_loc, retdest + // stack: i != start_loc, i-1, base_addr, new_carry, start_loc, retdest %jumpi(shr_loop) shr_end: - // stack: i, new_carry, start_loc, retdest - %pop3 + // stack: i, base_addr, new_carry, start_loc, retdest + %pop4 // stack: retdest JUMP diff --git a/evm/src/cpu/kernel/asm/bignum/util.asm b/evm/src/cpu/kernel/asm/bignum/util.asm index 7bd6e0dc33..f0a1563450 100644 --- a/evm/src/cpu/kernel/asm/bignum/util.asm +++ b/evm/src/cpu/kernel/asm/bignum/util.asm @@ -1,15 +1,21 @@ %macro memcpy_current_general // stack: dst, src, len - GET_CONTEXT - %stack (context, dst, src, len) -> (context, @SEGMENT_KERNEL_GENERAL, dst, context, @SEGMENT_KERNEL_GENERAL, src, len, %%after) + // DST and SRC are offsets, for the same memory segment + %build_current_general_address_no_offset + %stack (addr_no_offset, dst, src, len) -> (addr_no_offset, src, addr_no_offset, dst, len, 
%%after) + ADD + // stack: SRC, addr_no_offset, dst, len, %%after + SWAP2 + ADD + // stack: DST, SRC, len, %%after %jump(memcpy) %%after: %endmacro %macro clear_current_general // stack: dst, len - GET_CONTEXT - %stack (context, dst, len) -> (context, @SEGMENT_KERNEL_GENERAL, dst, len, %%after) + %build_current_general_address + %stack (DST, len) -> (DST, len, %%after) %jump(memset) %%after: %endmacro diff --git a/evm/src/cpu/kernel/asm/bloom_filter.asm b/evm/src/cpu/kernel/asm/bloom_filter.asm index d30b3c207b..35a4ebd763 100644 --- a/evm/src/cpu/kernel/asm/bloom_filter.asm +++ b/evm/src/cpu/kernel/asm/bloom_filter.asm @@ -55,20 +55,21 @@ logs_bloom_loop: // Add address to bloom filter. %increment // stack: addr_ptr, i, logs_len, retdest + PUSH @SEGMENT_LOGS_DATA %build_kernel_address DUP1 - %mload_kernel(@SEGMENT_LOGS_DATA) - // stack: addr, addr_ptr, i, logs_len, retdest + MLOAD_GENERAL + // stack: addr, full_addr_ptr, i, logs_len, retdest PUSH 0 - // stack: is_topic, addr, addr_ptr, i, logs_len, retdest + // stack: is_topic, addr, full_addr_ptr, i, logs_len, retdest %add_to_bloom - // stack: addr_ptr, i, logs_len, retdest + // stack: full_addr_ptr, i, logs_len, retdest %increment - // stack: num_topics_ptr, i, logs_len, retdest + // stack: full_num_topics_ptr, i, logs_len, retdest DUP1 - %mload_kernel(@SEGMENT_LOGS_DATA) - // stack: num_topics, num_topics_ptr, i, logs_len, retdest + MLOAD_GENERAL + // stack: num_topics, full_num_topics_ptr, i, logs_len, retdest SWAP1 %increment - // stack: topics_ptr, num_topics, i, logs_len, retdest + // stack: full_topics_ptr, num_topics, i, logs_len, retdest PUSH 0 logs_bloom_topic_loop: @@ -78,7 +79,7 @@ logs_bloom_topic_loop: %jumpi(logs_bloom_topic_end) DUP2 DUP2 ADD // stack: curr_topic_ptr, j, topics_ptr, num_topics, i, logs_len, retdest - %mload_kernel(@SEGMENT_LOGS_DATA) + MLOAD_GENERAL // stack: topic, j, topics_ptr, num_topics, i, logs_len, retdest PUSH 1 // stack: is_topic, topic, j, topics_ptr, num_topics, i, 
logs_len, retdest @@ -142,31 +143,20 @@ logs_bloom_end: // Also updates the block bloom filter. %macro bloom_write_bit // stack: byte_index, byte_bit_index - DUP2 - // stack: byte_bit_index, byte_index, byte_bit_index + PUSH @SEGMENT_TXN_BLOOM + %build_kernel_address + PUSH 1 + DUP3 + // stack: byte_bit_index, 1, byte_addr, byte_bit_index PUSH 7 SUB - PUSH 1 SWAP1 SHL + SHL // Updates the current txn bloom filter. - // stack: one_shifted_by_index, byte_index, byte_bit_index - DUP2 DUP1 - // stack: byte_index, byte_index, one_shifted_by_index, byte_index, byte_bit_index - // load bloom_byte from current txn bloom filter - %mload_kernel(@SEGMENT_TXN_BLOOM) - %stack (old_bloom_byte, byte_index, one_shifted_by_index) -> (old_bloom_byte, one_shifted_by_index, byte_index, one_shifted_by_index) - OR - // stack: new_bloom_byte, byte_index, one_shifted_by_index, byte_index, byte_bit_index - SWAP1 - %mstore_kernel(@SEGMENT_TXN_BLOOM) - // stack: one_shifted_by_index, byte_index, byte_bit_index - - // Updates the block bloom filter. 
SWAP2 POP DUP1 - %mload_kernel(@SEGMENT_BLOCK_BLOOM) - // stack: old_bloom_byte, byte_index, one_shifted_by_index + MLOAD_GENERAL + // stack: old_bloom_byte, byte_addr, one_shifted_by_index DUP3 OR - // stack: new_bloom_byte, byte_index, one_shifted_by_index - SWAP1 - %mstore_kernel(@SEGMENT_BLOCK_BLOOM) + // stack: new_bloom_byte, byte_addr, one_shifted_by_index + MSTORE_GENERAL // stack: one_shifted_by_index POP // stack: empty diff --git a/evm/src/cpu/kernel/asm/core/access_lists.asm b/evm/src/cpu/kernel/asm/core/access_lists.asm index b1b9fd5d5e..30afe27c41 100644 --- a/evm/src/cpu/kernel/asm/core/access_lists.asm +++ b/evm/src/cpu/kernel/asm/core/access_lists.asm @@ -25,12 +25,15 @@ global insert_accessed_addresses: // stack: addr, retdest %mload_global_metadata(@GLOBAL_METADATA_ACCESSED_ADDRESSES_LEN) // stack: len, addr, retdest - PUSH 0 + PUSH @SEGMENT_ACCESSED_ADDRESSES ADD + PUSH @SEGMENT_ACCESSED_ADDRESSES insert_accessed_addresses_loop: + // `i` and `len` are both scaled by SEGMENT_ACCESSED_ADDRESSES %stack (i, len, addr, retdest) -> (i, len, i, len, addr, retdest) EQ %jumpi(insert_address) // stack: i, len, addr, retdest - DUP1 %mload_kernel(@SEGMENT_ACCESSED_ADDRESSES) + DUP1 + MLOAD_GENERAL // stack: loaded_addr, i, len, addr, retdest DUP4 // stack: addr, loaded_addr, i, len, addr, retdest @@ -42,9 +45,10 @@ insert_accessed_addresses_loop: insert_address: %stack (i, len, addr, retdest) -> (i, addr, len, retdest) DUP2 %journal_add_account_loaded // Add a journal entry for the loaded account. - %mstore_kernel(@SEGMENT_ACCESSED_ADDRESSES) // Store new address at the end of the array. + %swap_mstore // Store new address at the end of the array. // stack: len, retdest %increment + %sub_const(@SEGMENT_ACCESSED_ADDRESSES) // unscale `len` %mstore_global_metadata(@GLOBAL_METADATA_ACCESSED_ADDRESSES_LEN) // Store new length. PUSH 1 // Return 1 to indicate that the address was inserted. 
SWAP1 JUMP @@ -59,12 +63,14 @@ global remove_accessed_addresses: // stack: addr, retdest %mload_global_metadata(@GLOBAL_METADATA_ACCESSED_ADDRESSES_LEN) // stack: len, addr, retdest - PUSH 0 + PUSH @SEGMENT_ACCESSED_ADDRESSES ADD + PUSH @SEGMENT_ACCESSED_ADDRESSES remove_accessed_addresses_loop: + // `i` and `len` are both scaled by SEGMENT_ACCESSED_ADDRESSES %stack (i, len, addr, retdest) -> (i, len, i, len, addr, retdest) EQ %jumpi(panic) // stack: i, len, addr, retdest - DUP1 %mload_kernel(@SEGMENT_ACCESSED_ADDRESSES) + DUP1 MLOAD_GENERAL // stack: loaded_addr, i, len, addr, retdest DUP4 // stack: addr, loaded_addr, i, len, addr, retdest @@ -74,12 +80,15 @@ remove_accessed_addresses_loop: %jump(remove_accessed_addresses_loop) remove_accessed_addresses_found: %stack (i, len, addr, retdest) -> (len, 1, i, retdest) - SUB DUP1 %mstore_global_metadata(@GLOBAL_METADATA_ACCESSED_ADDRESSES_LEN) // Decrement the access list length. + SUB // len -= 1 + PUSH @SEGMENT_ACCESSED_ADDRESSES + DUP2 SUB // unscale `len` + %mstore_global_metadata(@GLOBAL_METADATA_ACCESSED_ADDRESSES_LEN) // Decrement the access list length. // stack: len-1, i, retdest - %mload_kernel(@SEGMENT_ACCESSED_ADDRESSES) // Load the last address in the access list. + MLOAD_GENERAL // Load the last address in the access list. // stack: last_addr, i, retdest - SWAP1 - %mstore_kernel(@SEGMENT_ACCESSED_ADDRESSES) // Store the last address at the position of the removed address. + MSTORE_GENERAL + // Store the last address at the position of the removed address. 
JUMP @@ -97,14 +106,16 @@ global insert_accessed_storage_keys: // stack: addr, key, value, retdest %mload_global_metadata(@GLOBAL_METADATA_ACCESSED_STORAGE_KEYS_LEN) // stack: len, addr, key, value, retdest - PUSH 0 + PUSH @SEGMENT_ACCESSED_STORAGE_KEYS ADD + PUSH @SEGMENT_ACCESSED_STORAGE_KEYS insert_accessed_storage_keys_loop: + // `i` and `len` are both scaled by SEGMENT_ACCESSED_STORAGE_KEYS %stack (i, len, addr, key, value, retdest) -> (i, len, i, len, addr, key, value, retdest) EQ %jumpi(insert_storage_key) // stack: i, len, addr, key, value, retdest - DUP1 %increment %mload_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP1 %increment MLOAD_GENERAL // stack: loaded_key, i, len, addr, key, value, retdest - DUP2 %mload_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP2 MLOAD_GENERAL // stack: loaded_addr, loaded_key, i, len, addr, key, value, retdest DUP5 EQ // stack: loaded_addr==addr, loaded_key, i, len, addr, key, value, retdest @@ -120,14 +131,18 @@ insert_storage_key: // stack: i, len, addr, key, value, retdest DUP4 DUP4 %journal_add_storage_loaded // Add a journal entry for the loaded storage key. // stack: i, len, addr, key, value, retdest - DUP1 %increment - DUP1 %increment - %stack (i_plus_2, i_plus_1, i, len, addr, key, value) -> (i, addr, i_plus_1, key, i_plus_2, value, i_plus_2, value) - %mstore_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) // Store new address at the end of the array. - %mstore_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) // Store new key after that - %mstore_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) // Store new value after that - // stack: i_plus_2, value, retdest - %increment + + %stack(dst, len, addr, key, value) -> (addr, dst, dst, key, dst, value, dst, @SEGMENT_ACCESSED_STORAGE_KEYS, value) + MSTORE_GENERAL // Store new address at the end of the array. 
+ // stack: dst, key, dst, value, dst, segment, value, retdest + %increment SWAP1 + MSTORE_GENERAL // Store new key after that + // stack: dst, value, dst, segment, value, retdest + %add_const(2) SWAP1 + MSTORE_GENERAL // Store new value after that + // stack: dst, segment, value, retdest + %add_const(3) + SUB // unscale dst %mstore_global_metadata(@GLOBAL_METADATA_ACCESSED_STORAGE_KEYS_LEN) // Store new length. %stack (value, retdest) -> (retdest, 1, value) // Return 1 to indicate that the storage key was inserted. JUMP @@ -135,7 +150,7 @@ insert_storage_key: insert_accessed_storage_keys_found: // stack: i, len, addr, key, value, retdest %add_const(2) - %mload_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + MLOAD_GENERAL %stack (original_value, len, addr, key, value, retdest) -> (retdest, 0, original_value) // Return 0 to indicate that the storage key was already present. JUMP @@ -145,14 +160,16 @@ global remove_accessed_storage_keys: // stack: addr, key, retdest %mload_global_metadata(@GLOBAL_METADATA_ACCESSED_STORAGE_KEYS_LEN) // stack: len, addr, key, retdest - PUSH 0 + PUSH @SEGMENT_ACCESSED_STORAGE_KEYS ADD + PUSH @SEGMENT_ACCESSED_STORAGE_KEYS remove_accessed_storage_keys_loop: + // `i` and `len` are both scaled by SEGMENT_ACCESSED_STORAGE_KEYS %stack (i, len, addr, key, retdest) -> (i, len, i, len, addr, key, retdest) EQ %jumpi(panic) // stack: i, len, addr, key, retdest - DUP1 %increment %mload_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP1 %increment MLOAD_GENERAL // stack: loaded_key, i, len, addr, key, retdest - DUP2 %mload_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP2 MLOAD_GENERAL // stack: loaded_addr, loaded_key, i, len, addr, key, retdest DUP5 EQ // stack: loaded_addr==addr, loaded_key, i, len, addr, key, retdest @@ -166,18 +183,21 @@ remove_accessed_storage_keys_loop: remove_accessed_storage_keys_found: %stack (i, len, addr, key, retdest) -> (len, 3, i, retdest) - SUB DUP1 %mstore_global_metadata(@GLOBAL_METADATA_ACCESSED_STORAGE_KEYS_LEN) // Decrease the 
access list length. + SUB + PUSH @SEGMENT_ACCESSED_STORAGE_KEYS + DUP2 SUB // unscale + %mstore_global_metadata(@GLOBAL_METADATA_ACCESSED_STORAGE_KEYS_LEN) // Decrease the access list length. // stack: len-3, i, retdest - DUP1 %add_const(2) %mload_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP1 %add_const(2) MLOAD_GENERAL // stack: last_value, len-3, i, retdest - DUP2 %add_const(1) %mload_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP2 %add_const(1) MLOAD_GENERAL // stack: last_key, last_value, len-3, i, retdest - DUP3 %mload_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP3 MLOAD_GENERAL // stack: last_addr, last_key, last_value, len-3, i, retdest - DUP5 %mstore_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) // Move the last tuple to the position of the removed tuple. + DUP5 %swap_mstore // Move the last tuple to the position of the removed tuple. // stack: last_key, last_value, len-3, i, retdest - DUP4 %add_const(1) %mstore_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP4 %add_const(1) %swap_mstore // stack: last_value, len-3, i, retdest - DUP3 %add_const(2) %mstore_kernel(@SEGMENT_ACCESSED_STORAGE_KEYS) + DUP3 %add_const(2) %swap_mstore // stack: len-3, i, retdest %pop2 JUMP diff --git a/evm/src/cpu/kernel/asm/core/call.asm b/evm/src/cpu/kernel/asm/core/call.asm index af5ab3196c..b5b8935471 100644 --- a/evm/src/cpu/kernel/asm/core/call.asm +++ b/evm/src/cpu/kernel/asm/core/call.asm @@ -1,4 +1,5 @@ // Handlers for call-like operations, namely CALL, CALLCODE, STATICCALL and DELEGATECALL. +// Reminder: All context metadata hardcoded offsets are already scaled by `Segment::ContextMetadata`. // Creates a new sub context and executes the code of the given account. global sys_call: @@ -271,7 +272,10 @@ call_too_deep: // because it will already be 0 by default. 
%macro set_static_true // stack: new_ctx - %stack (new_ctx) -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_STATIC, 1, new_ctx) + DUP1 + %build_address_with_ctx_no_segment(@CTX_METADATA_STATIC) + PUSH 1 + // stack: 1, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro @@ -279,81 +283,97 @@ call_too_deep: // Set @CTX_METADATA_STATIC of the next context to the current value. %macro set_static // stack: new_ctx + DUP1 + %build_address_with_ctx_no_segment(@CTX_METADATA_STATIC) %mload_context_metadata(@CTX_METADATA_STATIC) - %stack (is_static, new_ctx) -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_STATIC, is_static, new_ctx) + // stack: is_static, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro %macro set_new_ctx_addr // stack: called_addr, new_ctx - %stack (called_addr, new_ctx) - -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_ADDRESS, called_addr, new_ctx) + DUP2 + %build_address_with_ctx_no_segment(@CTX_METADATA_ADDRESS) + SWAP1 + // stack: called_addr, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro %macro set_new_ctx_caller // stack: sender, new_ctx - %stack (sender, new_ctx) - -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_CALLER, sender, new_ctx) + DUP2 + %build_address_with_ctx_no_segment(@CTX_METADATA_CALLER) + SWAP1 + // stack: sender, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro %macro set_new_ctx_value // stack: value, new_ctx - %stack (value, new_ctx) - -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_CALL_VALUE, value, new_ctx) + DUP2 + %build_address_with_ctx_no_segment(@CTX_METADATA_CALL_VALUE) + SWAP1 + // stack: value, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro %macro set_new_ctx_code_size // stack: code_size, new_ctx - %stack (code_size, new_ctx) - -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_CODE_SIZE, code_size, new_ctx) + DUP2 + %build_address_with_ctx_no_segment(@CTX_METADATA_CODE_SIZE) + SWAP1 + // stack: code_size, addr, new_ctx MSTORE_GENERAL // stack: 
new_ctx %endmacro %macro set_new_ctx_calldata_size // stack: calldata_size, new_ctx - %stack (calldata_size, new_ctx) - -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_CALLDATA_SIZE, calldata_size, new_ctx) + DUP2 + %build_address_with_ctx_no_segment(@CTX_METADATA_CALLDATA_SIZE) + SWAP1 + // stack: calldata_size, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro %macro set_new_ctx_gas_limit // stack: gas_limit, new_ctx - %stack (gas_limit, new_ctx) - -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_GAS_LIMIT, gas_limit, new_ctx) + DUP2 + %build_address_with_ctx_no_segment(@CTX_METADATA_GAS_LIMIT) + SWAP1 + // stack: gas_limit, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro %macro set_new_ctx_parent_ctx // stack: new_ctx + DUP1 + %build_address_with_ctx_no_segment(@CTX_METADATA_PARENT_CONTEXT) GET_CONTEXT - PUSH @CTX_METADATA_PARENT_CONTEXT - PUSH @SEGMENT_CONTEXT_METADATA - DUP4 // new_ctx + // stack: ctx, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro %macro set_new_ctx_parent_pc(label) // stack: new_ctx - %stack (new_ctx) - -> (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_PARENT_PC, $label, new_ctx) + DUP1 + %build_address_with_ctx_no_segment(@CTX_METADATA_PARENT_PC) + PUSH $label + // stack: label, addr, new_ctx MSTORE_GENERAL // stack: new_ctx %endmacro %macro set_new_ctx_code - %stack (address, new_ctx) -> (address, new_ctx, @SEGMENT_CODE, %%after, new_ctx) - %jump(load_code) + %stack (address, new_ctx) -> (address, new_ctx, %%after, new_ctx) + %jump(load_code_padded) %%after: %set_new_ctx_code_size // stack: new_ctx @@ -367,12 +387,10 @@ call_too_deep: %checkpoint // Checkpoint %increment_call_depth // Perform jumpdest analyis - PUSH %%after %mload_context_metadata(@CTX_METADATA_CODE_SIZE) GET_CONTEXT // stack: ctx, code_size, retdest - %jump(jumpdest_analysis) -%%after: + %jumpdest_analysis PUSH 0 // jump dest EXIT_KERNEL // (Old context) stack: new_ctx @@ -381,17 +399,18 @@ call_too_deep: %macro 
copy_mem_to_calldata // stack: new_ctx, args_offset, args_size GET_CONTEXT - %stack (ctx, new_ctx, args_offset, args_size) -> - ( - new_ctx, @SEGMENT_CALLDATA, 0, // DST - ctx, @SEGMENT_MAIN_MEMORY, args_offset, // SRC - args_size, %%after, // count, retdest - new_ctx, args_size - ) - %jump(memcpy) + %stack(ctx, new_ctx, args_offset, args_size) -> (ctx, @SEGMENT_MAIN_MEMORY, args_offset, args_size, %%after, new_ctx, args_size) + %build_address + // stack: SRC, args_size, %%after, new_ctx, args_size + DUP4 + %build_address_with_ctx_no_offset(@SEGMENT_CALLDATA) + // stack: DST, SRC, args_size, %%after, new_ctx, args_size + %jump(memcpy_bytes) %%after: - %stack (new_ctx, args_size) -> - (new_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_CALLDATA_SIZE, args_size) + // stack: new_ctx, args_size + %build_address_with_ctx_no_segment(@CTX_METADATA_CALLDATA_SIZE) + // stack: addr, args_size + SWAP1 MSTORE_GENERAL // stack: (empty) %endmacro @@ -403,14 +422,13 @@ call_too_deep: // stack: returndata_size, ret_size, new_ctx, success, ret_offset, kexit_info %min GET_CONTEXT - %stack (ctx, n, new_ctx, success, ret_offset, kexit_info) -> - ( - ctx, @SEGMENT_MAIN_MEMORY, ret_offset, // DST - ctx, @SEGMENT_RETURNDATA, 0, // SRC - n, %%after, // count, retdest - kexit_info, success - ) - %jump(memcpy) + %stack (ctx, n, new_ctx, success, ret_offset, kexit_info) -> (ctx, @SEGMENT_RETURNDATA, @SEGMENT_MAIN_MEMORY, ret_offset, ctx, n, %%after, kexit_info, success) + %build_address_no_offset + // stack: SRC, @SEGMENT_MAIN_MEMORY, ret_offset, ctx, n, %%after, kexit_info, success + SWAP3 + %build_address + // stack: DST, SRC, n, %%after, kexit_info, success + %jump(memcpy_bytes) %%after: %endmacro diff --git a/evm/src/cpu/kernel/asm/core/call_gas.asm b/evm/src/cpu/kernel/asm/core/call_gas.asm index 69e2796661..3961352139 100644 --- a/evm/src/cpu/kernel/asm/core/call_gas.asm +++ b/evm/src/cpu/kernel/asm/core/call_gas.asm @@ -9,7 +9,7 @@ // Charge gas for *call opcodes and return the 
sub-context gas limit. // Doesn't include memory expansion costs. global call_charge_gas: - // Compute C_aaccess + // Compute C_access // stack: is_call_or_callcode, is_call_or_staticcall, cold_access, address, gas, kexit_info, value, retdest SWAP2 // stack: cold_access, is_call_or_staticcall, is_call_or_callcode, address, gas, kexit_info, value, retdest diff --git a/evm/src/cpu/kernel/asm/core/create.asm b/evm/src/cpu/kernel/asm/core/create.asm index ddaf96de03..80f8f46188 100644 --- a/evm/src/cpu/kernel/asm/core/create.asm +++ b/evm/src/cpu/kernel/asm/core/create.asm @@ -57,6 +57,7 @@ global sys_create2: DUP5 // code_offset PUSH @SEGMENT_MAIN_MEMORY GET_CONTEXT + %build_address KECCAK_GENERAL // stack: hash, salt, create_common, value, code_offset, code_len, kexit_info @@ -99,12 +100,16 @@ global create_common: %set_new_ctx_code_size POP // Copy the code from memory to the new context's code segment. %stack (src_ctx, new_ctx, address, value, code_offset, code_len) - -> (new_ctx, @SEGMENT_CODE, 0, // DST - src_ctx, @SEGMENT_MAIN_MEMORY, code_offset, // SRC + -> (src_ctx, @SEGMENT_MAIN_MEMORY, code_offset, // SRC + new_ctx, // DST (SEGMENT_CODE == virt == 0) code_len, run_constructor, new_ctx, value, address) - %jump(memcpy) + %build_address + // stack: SRC, DST, code_len, run_constructor, new_ctx, value, address + SWAP1 + // stack: DST, SRC, code_len, run_constructor, new_ctx, value, address + %jump(memcpy_bytes) run_constructor: // stack: new_ctx, value, address, kexit_info @@ -144,7 +149,11 @@ after_constructor: POP // EIP-3541: Reject new contract code starting with the 0xEF byte - PUSH 0 %mload_current(@SEGMENT_RETURNDATA) %eq_const(0xEF) %jumpi(create_first_byte_ef) + PUSH @SEGMENT_RETURNDATA + GET_CONTEXT + %build_address_no_offset + MLOAD_GENERAL + %eq_const(0xEF) %jumpi(create_first_byte_ef) // Charge gas for the code size. 
// stack: leftover_gas, success, address, kexit_info @@ -160,9 +169,9 @@ after_constructor: %pop_checkpoint // Store the code hash of the new contract. - GET_CONTEXT %returndatasize - %stack (size, ctx) -> (ctx, @SEGMENT_RETURNDATA, 0, size) // context, segment, offset, len + PUSH @SEGMENT_RETURNDATA GET_CONTEXT %build_address_no_offset + // stack: addr, len KECCAK_GENERAL // stack: codehash, leftover_gas, success, address, kexit_info %observe_new_contract diff --git a/evm/src/cpu/kernel/asm/core/create_addresses.asm b/evm/src/cpu/kernel/asm/core/create_addresses.asm index 70f57b6f0b..8c2de08bd2 100644 --- a/evm/src/cpu/kernel/asm/core/create_addresses.asm +++ b/evm/src/cpu/kernel/asm/core/create_addresses.asm @@ -14,10 +14,7 @@ global get_create_address: %encode_rlp_scalar // stack: rlp_pos, rlp_start, retdest %prepend_rlp_list_prefix - // stack: rlp_prefix_start, rlp_len, retdest - PUSH @SEGMENT_RLP_RAW - PUSH 0 // context - // stack: RLP_ADDR: 3, rlp_len, retdest + // stack: RLP_ADDR, rlp_len, retdest KECCAK_GENERAL // stack: hash, retdest %u256_to_addr @@ -40,20 +37,21 @@ global get_create_address: // Post stack: address global get_create2_address: // stack: sender, code_hash, salt, retdest - PUSH 0xff PUSH 0 %mstore_kernel_general - %stack (sender, code_hash, salt, retdest) -> (0, @SEGMENT_KERNEL_GENERAL, 1, sender, 20, get_create2_address_contd, salt, code_hash, retdest) - %jump(mstore_unpacking) -get_create2_address_contd: + PUSH @SEGMENT_KERNEL_GENERAL + DUP1 + PUSH 0xff + MSTORE_GENERAL + // stack: addr, sender, code_hash, salt, retdest + %increment + %stack (addr, sender, code_hash, salt, retdest) -> (addr, sender, salt, code_hash, retdest) + MSTORE_32BYTES_20 + // stack: addr, salt, code_hash, retdest + MSTORE_32BYTES_32 + // stack: addr, code_hash, retdest + MSTORE_32BYTES_32 POP - %stack (salt, code_hash, retdest) -> (0, @SEGMENT_KERNEL_GENERAL, 21, salt, 32, get_create2_address_contd2, code_hash, retdest) - %jump(mstore_unpacking) 
-get_create2_address_contd2: - POP - %stack (code_hash, retdest) -> (0, @SEGMENT_KERNEL_GENERAL, 53, code_hash, 32, get_create2_address_finish, retdest) - %jump(mstore_unpacking) -get_create2_address_finish: - POP - %stack (retdest) -> (0, @SEGMENT_KERNEL_GENERAL, 0, 85, retdest) // context, segment, offset, len + %stack (retdest) -> (@SEGMENT_KERNEL_GENERAL, 85, retdest) // offset == context == 0 + // addr, len, retdest KECCAK_GENERAL // stack: hash, retdest %u256_to_addr diff --git a/evm/src/cpu/kernel/asm/core/create_receipt.asm b/evm/src/cpu/kernel/asm/core/create_receipt.asm index ec9b1fbd21..60e9264739 100644 --- a/evm/src/cpu/kernel/asm/core/create_receipt.asm +++ b/evm/src/cpu/kernel/asm/core/create_receipt.asm @@ -55,8 +55,8 @@ process_receipt_after_bloom: %get_trie_data_size // stack: receipt_ptr, payload_len, status, new_cum_gas, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest // Write transaction type if necessary. RLP_RAW contains, at index 0, the current transaction type. - PUSH 0 - %mload_kernel(@SEGMENT_RLP_RAW) + PUSH @SEGMENT_RLP_RAW // ctx == virt == 0 + MLOAD_GENERAL // stack: first_txn_byte, receipt_ptr, payload_len, status, new_cum_gas, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest DUP1 %eq_const(1) %jumpi(receipt_nonzero_type) DUP1 %eq_const(2) %jumpi(receipt_nonzero_type) @@ -79,10 +79,12 @@ process_receipt_after_type: // stack: receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest // Write Bloom filter. PUSH 256 // Bloom length. - PUSH 0 PUSH @SEGMENT_TXN_BLOOM PUSH 0 // Bloom memory address. - %get_trie_data_size PUSH @SEGMENT_TRIE_DATA PUSH 0 // MPT dest address. + PUSH @SEGMENT_TXN_BLOOM // ctx == virt == 0 + // stack: bloom_addr, 256, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest + %get_trie_data_size + PUSH @SEGMENT_TRIE_DATA ADD // MPT dest address. 
// stack: DST, SRC, 256, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest - %memcpy + %memcpy_bytes // stack: receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest // Update trie data size. %get_trie_data_size @@ -114,22 +116,23 @@ process_receipt_logs_loop: %mload_kernel(@SEGMENT_LOGS) // stack: log_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest // Write payload_len. + PUSH @SEGMENT_LOGS_DATA %build_kernel_address DUP1 - %mload_kernel(@SEGMENT_LOGS_DATA) + MLOAD_GENERAL %append_to_trie_data // stack: log_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest // Write address. %increment // stack: addr_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest DUP1 - %mload_kernel(@SEGMENT_LOGS_DATA) + MLOAD_GENERAL %append_to_trie_data // stack: addr_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest //Write num_topics. %increment // stack: num_topics_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest DUP1 - %mload_kernel(@SEGMENT_LOGS_DATA) + MLOAD_GENERAL // stack: num_topics, num_topics_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest DUP1 %append_to_trie_data @@ -149,7 +152,7 @@ process_receipt_topics_loop: DUP3 DUP2 ADD // stack: cur_topic_ptr, j, num_topics, topics_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest - %mload_kernel(@SEGMENT_LOGS_DATA) + MLOAD_GENERAL %append_to_trie_data // stack: j, num_topics, topics_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest %increment @@ -162,7 +165,7 @@ process_receipt_topics_end: // stack: data_len_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest // Write data_len DUP1 - %mload_kernel(@SEGMENT_LOGS_DATA) + MLOAD_GENERAL // stack: data_len, data_len_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest 
DUP1 %append_to_trie_data @@ -182,7 +185,7 @@ process_receipt_data_loop: DUP3 DUP2 ADD // stack: cur_data_ptr, j, data_len, data_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest - %mload_kernel(@SEGMENT_LOGS_DATA) + MLOAD_GENERAL %append_to_trie_data // stack: j, data_len, data_ptr, i, num_logs, receipt_ptr, txn_nb, new_cum_gas, txn_nb, num_nibbles, retdest %increment @@ -203,24 +206,10 @@ process_receipt_after_write: DUP5 %mpt_insert_receipt_trie // stack: new_cum_gas, txn_nb, num_nibbles, retdest - // Now, we set the Bloom filter back to 0. We proceed by chunks of 32 bytes. - PUSH 32 - PUSH 0 - %rep 8 - // stack: counter, 32, new_cum_gas, txn_nb, num_nibbles, retdest - DUP2 - PUSH 0 // we will fill the memory segment with zeroes - DUP2 - PUSH @SEGMENT_TXN_BLOOM - DUP3 // kernel context is 0 - // stack: ctx, segment, counter, 0, 32, counter, 32, new_cum_gas, txn_nb, num_nibbles, retdest - MSTORE_32BYTES - // stack: counter, 32, new_cum_gas, txn_nb, num_nibbles, retdest - DUP2 - ADD - %endrep - %pop2 - // stack: new_cum_gas, txn_nb, num_nibbles, retdest + + // We don't need to reset the bloom filter segment as we only process a single transaction. + // TODO: Revert in case we add back support for multi-txn proofs. + %stack (new_cum_gas, txn_nb, num_nibbles, retdest) -> (retdest, new_cum_gas) JUMP diff --git a/evm/src/cpu/kernel/asm/core/exception.asm b/evm/src/cpu/kernel/asm/core/exception.asm index 05d8a25fed..80bf7dbdf6 100644 --- a/evm/src/cpu/kernel/asm/core/exception.asm +++ b/evm/src/cpu/kernel/asm/core/exception.asm @@ -1,4 +1,6 @@ -// These exception codes are arbitary and assigned by us. +// These exception codes are arbitrary and assigned by us. +// Note that exceptions can only be triggered in user mode. Triggering an exception +// in kernel mode will fail the constraints.
global exception_jumptable: // exception 0: out of gas JUMPTABLE exc_out_of_gas @@ -24,8 +26,31 @@ global exception_jumptable: global exc_out_of_gas: - // TODO - %jump(fault_exception) + // stack: trap_info + %ctx_gas_limit + // stack: gas_limit, trap_info + DUP2 %shr_const(192) + // stack: gas_used, gas_limit, trap_info + DUP2 DUP2 + // stack: gas_used, gas_limit, gas_used, gas_limit, trap_info + // If gas_used is already over the limit, panic. The exception should have + // been raised earlier. + GT %jumpi(panic) + // stack: gas_used, gas_limit, trap_info + DUP3 %opcode_from_exp_trap_info + // stack: opcode, gas_used, gas_limit, trap_info + %add_const(gas_cost_for_opcode) + %mload_kernel_code + // stack: gas_cost, gas_used, gas_limit, trap_info + ADD + // stack: new_gas_used, gas_limit, trap_info + GT + // stack: is_oog, trap_info + SWAP1 POP + // stack: is_oog + %jumpi(fault_exception) + // If we didn't jump, we shouldn't have raised the exception. + PANIC global exc_invalid_opcode: @@ -276,11 +301,16 @@ min_stack_len_for_opcode: BYTES 4 // 0xa2, LOG2 BYTES 5 // 0xa3, LOG3 BYTES 6 // 0xa4, LOG4 - %rep 11 // 0xa5-0xaf, invalid + + %rep 27 // 0xa5-0xbf, invalid BYTES 0 %endrep - %rep 64 // 0xb0-0xef, invalid + %rep 32 // 0xc0-0xdf, MSTORE_32BYTES + BYTES 4 + %endrep + + %rep 16 // 0xe0-0xef, invalid BYTES 0 %endrep @@ -299,3 +329,110 @@ min_stack_len_for_opcode: BYTES 2 // 0xfd, REVERT BYTES 0 // 0xfe, invalid BYTES 1 // 0xff, SELFDESTRUCT + +// A zero indicates either that the opcode is kernel-only, +// or that it's handled with a syscall. 
+gas_cost_for_opcode: + BYTES 0 // 0x00, STOP + BYTES @GAS_VERYLOW // 0x01, ADD + BYTES @GAS_LOW // 0x02, MUL + BYTES @GAS_VERYLOW // 0x03, SUB + BYTES @GAS_LOW // 0x04, DIV + BYTES @GAS_LOW // 0x05, SDIV + BYTES @GAS_LOW // 0x06, MOD + BYTES @GAS_LOW // 0x07, SMOD + BYTES @GAS_MID // 0x08, ADDMOD + BYTES @GAS_MID // 0x09, MULMOD + BYTES 0 // 0x0a, EXP + BYTES 0 // 0x0b, SIGNEXTEND + %rep 4 // 0x0c-0x0f, invalid + BYTES 0 + %endrep + + BYTES @GAS_VERYLOW // 0x10, LT + BYTES @GAS_VERYLOW // 0x11, GT + BYTES @GAS_VERYLOW // 0x12, SLT + BYTES @GAS_VERYLOW // 0x13, SGT + BYTES @GAS_VERYLOW // 0x14, EQ + BYTES @GAS_VERYLOW // 0x15, ISZERO + BYTES @GAS_VERYLOW // 0x16, AND + BYTES @GAS_VERYLOW // 0x17, OR + BYTES @GAS_VERYLOW // 0x18, XOR + BYTES @GAS_VERYLOW // 0x19, NOT + BYTES @GAS_VERYLOW // 0x1a, BYTE + BYTES @GAS_VERYLOW // 0x1b, SHL + BYTES @GAS_VERYLOW // 0x1c, SHR + BYTES @GAS_VERYLOW // 0x1d, SAR + BYTES 0 // 0x1e, invalid + BYTES 0 // 0x1f, invalid + + BYTES 0 // 0x20, KECCAK256 + %rep 15 // 0x21-0x2f, invalid + BYTES 0 + %endrep + + %rep 25 //0x30-0x48, only syscalls + BYTES 0 + %endrep + + %rep 7 // 0x49-0x4f, invalid + BYTES 0 + %endrep + + BYTES @GAS_BASE // 0x50, POP + BYTES 0 // 0x51, MLOAD + BYTES 0 // 0x52, MSTORE + BYTES 0 // 0x53, MSTORE8 + BYTES 0 // 0x54, SLOAD + BYTES 0 // 0x55, SSTORE + BYTES @GAS_MID // 0x56, JUMP + BYTES @GAS_HIGH // 0x57, JUMPI + BYTES @GAS_BASE // 0x58, PC + BYTES 0 // 0x59, MSIZE + BYTES 0 // 0x5a, GAS + BYTES @GAS_JUMPDEST // 0x5b, JUMPDEST + %rep 3 // 0x5c-0x5e, invalid + BYTES 0 + %endrep + + BYTES @GAS_BASE // 0x5f, PUSH0 + %rep 32 // 0x60-0x7f, PUSH1-PUSH32 + BYTES @GAS_VERYLOW + %endrep + + %rep 16 // 0x80-0x8f, DUP1-DUP16 + BYTES @GAS_VERYLOW + %endrep + + %rep 16 // 0x90-0x9f, SWAP1-SWAP16 + BYTES @GAS_VERYLOW + %endrep + + BYTES 0 // 0xa0, LOG0 + BYTES 0 // 0xa1, LOG1 + BYTES 0 // 0xa2, LOG2 + BYTES 0 // 0xa3, LOG3 + BYTES 0 // 0xa4, LOG4 + %rep 11 // 0xa5-0xaf, invalid + BYTES 0 + %endrep + + %rep 64 // 0xb0-0xef, 
invalid + BYTES 0 + %endrep + + BYTES 0 // 0xf0, CREATE + BYTES 0 // 0xf1, CALL + BYTES 0 // 0xf2, CALLCODE + BYTES 0 // 0xf3, RETURN + BYTES 0 // 0xf4, DELEGATECALL + BYTES 0 // 0xf5, CREATE2 + %rep 4 // 0xf6-0xf9, invalid + BYTES 0 + %endrep + BYTES 0 // 0xfa, STATICCALL + BYTES 0 // 0xfb, invalid + BYTES 0 // 0xfc, invalid + BYTES 0 // 0xfd, REVERT + BYTES 0 // 0xfe, invalid + BYTES 0 // 0xff, SELFDESTRUCT diff --git a/evm/src/cpu/kernel/asm/core/gas.asm b/evm/src/cpu/kernel/asm/core/gas.asm index d5e4e9bb74..2e16c373e3 100644 --- a/evm/src/cpu/kernel/asm/core/gas.asm +++ b/evm/src/cpu/kernel/asm/core/gas.asm @@ -122,7 +122,7 @@ global sys_gasprice: // L(n) = n - floor(n / 64) %macro all_but_one_64th // stack: n - DUP1 %div_const(64) + DUP1 %shr_const(6) // stack: floor(n / 64), n SWAP1 SUB // stack: n - floor(n / 64) diff --git a/evm/src/cpu/kernel/asm/core/jumpdest_analysis.asm b/evm/src/cpu/kernel/asm/core/jumpdest_analysis.asm index a9d8adf2ff..934d1f6297 100644 --- a/evm/src/cpu/kernel/asm/core/jumpdest_analysis.asm +++ b/evm/src/cpu/kernel/asm/core/jumpdest_analysis.asm @@ -1,64 +1,344 @@ -// Populates @SEGMENT_JUMPDEST_BITS for the given context's code. -// Pre stack: ctx, code_len, retdest +// Set @SEGMENT_JUMPDEST_BITS to one between positions [init_pos, final_pos], +// for the given context's code. +// Pre stack: init_pos, ctx, final_pos, retdest // Post stack: (empty) -global jumpdest_analysis: - // stack: ctx, code_len, retdest - PUSH 0 // i = 0 - +global verify_path_and_write_jumpdest_table: + SWAP2 + DUP2 + ADD // final_addr + // stack: final_addr, ctx, i, retdest + SWAP2 + ADD // init_addr loop: - // stack: i, ctx, code_len, retdest - // Ideally we would break if i >= code_len, but checking i > code_len is - // cheaper. It doesn't hurt to over-read by 1, since we'll read 0 which is - // a no-op. 
- DUP3 DUP2 GT // i > code_len - %jumpi(return) - - // stack: i, ctx, code_len, retdest - %stack (i, ctx) -> (ctx, @SEGMENT_CODE, i, i, ctx) - MLOAD_GENERAL - // stack: opcode, i, ctx, code_len, retdest + // stack: i, final_pos, retdest + DUP2 DUP2 EQ // i == final_pos + %jumpi(proof_ok) + DUP2 DUP2 GT // i > final_pos + %jumpi(proof_not_ok) - DUP1 %eq_const(0x5b) - // stack: opcode == JUMPDEST, opcode, i, ctx, code_len, retdest - %jumpi(encountered_jumpdest) + // stack: i, final_pos, retdest + DUP1 + MLOAD_GENERAL // SEGMENT_CODE == 0 + // stack: opcode, i, final_pos, retdest - // stack: opcode, i, ctx, code_len, retdest - %code_bytes_to_skip - // stack: bytes_to_skip, i, ctx, code_len, retdest - ADD - %jump(continue) + DUP1 + // Slightly more efficient than `%eq_const(0x5b) ISZERO` + PUSH 0x5b + SUB + // stack: opcode != JUMPDEST, opcode, i, final_pos, retdest + %jumpi(continue) -encountered_jumpdest: - // stack: opcode, i, ctx, code_len, retdest - POP - // stack: i, ctx, code_len, retdest - %stack (i, ctx) -> (ctx, @SEGMENT_JUMPDEST_BITS, i, 1, i, ctx) + // stack: JUMPDEST, i, code_len, retdest + %stack (JUMPDEST, i) -> (@SEGMENT_JUMPDEST_BITS, i, JUMPDEST, i) + ADD // address to write jumpdest bit, i already contains the context + PUSH 1 + // stack: 1, addr, JUMPDEST, i MSTORE_GENERAL continue: - // stack: i, ctx, code_len, retdest - %increment + // stack: opcode, i, final_pos, retdest + %add_const(code_bytes_to_skip) + %mload_kernel_code + // stack: bytes_to_skip, i, final_pos, retdest + ADD + // stack: i, final_pos, retdest %jump(loop) -return: - // stack: i, ctx, code_len, retdest - %pop3 +proof_ok: + // stack: i, final_pos, retdest + // We already know final_pos is a jumpdest + %stack (i, final_pos) -> (@SEGMENT_JUMPDEST_BITS, final_pos) + ADD // final_pos already contains the context + PUSH 1 + MSTORE_GENERAL + JUMP +proof_not_ok: + %pop2 JUMP -// Determines how many bytes to skip, if any, based on the opcode we read. 
-// If we read a PUSH opcode, we skip over n bytes, otherwise we skip 0. +// Determines how many bytes away is the next opcode, based on the opcode we read. +// If we read a PUSH opcode, next opcode is in n + 1 bytes, otherwise it's the next one. // // Note that the range of PUSH opcodes is [0x60, 0x80). I.e. PUSH1 is 0x60 // and PUSH32 is 0x7f. -%macro code_bytes_to_skip - // stack: opcode - %sub_const(0x60) - // stack: opcode - 0x60 - DUP1 %lt_const(0x20) - // stack: is_push_opcode, opcode - 0x60 +code_bytes_to_skip: + %rep 96 + BYTES 1 // 0x00-0x5f + %endrep + + BYTES 2 + BYTES 3 + BYTES 4 + BYTES 5 + BYTES 6 + BYTES 7 + BYTES 8 + BYTES 9 + BYTES 10 + BYTES 11 + BYTES 12 + BYTES 13 + BYTES 14 + BYTES 15 + BYTES 16 + BYTES 17 + BYTES 18 + BYTES 19 + BYTES 20 + BYTES 21 + BYTES 22 + BYTES 23 + BYTES 24 + BYTES 25 + BYTES 26 + BYTES 27 + BYTES 28 + BYTES 29 + BYTES 30 + BYTES 31 + BYTES 32 + BYTES 33 + + %rep 128 + BYTES 1 // 0x80-0xff + %endrep + + +// A proof attesting that jumpdest is a valid jump destination is +// either 0 or an index 0 < i <= jumpdest - 32. +// A proof is valid if: +// - i == 0 and we can go from the first opcode to jumpdest and code[jumpdest] = 0x5b +// - i > 0 and: +// a) for j in {i+0,..., i+31} code[j] != PUSHk for all k >= 32 - j - i, +// b) we can go from opcode i+32 to jumpdest, +// c) code[jumpdest] = 0x5b. +// To reduce the number of instructions, when i > 32 we load all the bytes code[j], ..., +// code[j + 31] in a single 32-byte word, and check a) directly on the packed bytes. +// We perform the "packed verification" computing a boolean formula evaluated on the bits of +// code[j],..., code[j+31] of the form p_1 AND p_2 AND p_3 AND p_4 AND p_5, where: +// - p_k is either TRUE, for one subset of the j's which depends on k (for example, +// for k = 1, it is TRUE for the first 15 positions), or has_prefix_k => bit_{k + 1}_is_0 +// for the j's not in the subset. 
+// - has_prefix_k is a predicate that is TRUE if and only if code[j] has the same prefix of size k + 2 +// as PUSH{32-(j-i)}. +// stack: proof_prefix_addr, jumpdest, ctx, retdest +// stack: (empty) +global write_table_if_jumpdest: + // stack: proof_prefix_addr, jumpdest, ctx, retdest + %stack + (proof_prefix_addr, jumpdest, ctx) -> + (ctx, jumpdest, jumpdest, ctx, proof_prefix_addr) + ADD // combine context and offset to make an address (SEGMENT_CODE == 0) + MLOAD_GENERAL + // stack: opcode, jumpdest, ctx, proof_prefix_addr, retdest + + %jump_neq_const(0x5b, return) + + //stack: jumpdest, ctx, proof_prefix_addr, retdest + SWAP2 DUP1 + // stack: proof_prefix_addr, proof_prefix_addr, ctx, jumpdest + ISZERO + %jumpi(verify_path_and_write_jumpdest_table) + + + // stack: proof_prefix_addr, ctx, jumpdest, retdest + // If we are here we need to check that the next 32 bytes are less + // than JUMPXX for XX < 32 - i <=> opcode < 0x7f - i = 127 - i, 0 <= i < 32, + // or larger than 127 + + %stack + (proof_prefix_addr, ctx) -> + (ctx, proof_prefix_addr, 32, proof_prefix_addr, ctx) + ADD // combine context and offset to make an address (SEGMENT_CODE == 0) + MLOAD_32BYTES + // packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP1 %shl_const(1) + DUP2 %shl_const(2) + AND + // stack: (is_1_at_pos_2_and_3|(X)⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + // X denotes any value in {0,1} and Z^i is Z repeated i times + NOT + // stack: (is_0_at_2_or_3|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP2 + OR + // stack: (is_1_at_1 or is_0_at_2_or_3|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + // stack: (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + + // Compute in_range and has_prefix' = + // - in_range = (0xFF|X⁷)³² and ~has_prefix' = ~has_prefix OR is_0_at_4, for the first 15 bytes + // - in_range = (has_prefix => is_0_at_4 |X⁷)³² and ~has_prefix' = ~has_prefix, for the next 15 
bytes + // - in_range = (~has_prefix|X⁷)³² and ~has_prefix' = ~has_prefix, for the last byte. + DUP2 %shl_const(3) + NOT + // stack: (is_0_at_4|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00 + AND + // stack: (is_0_at_4|X⁷)³¹|0⁸, (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP1 + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000 + AND + // stack: (is_0_at_4|X⁷)¹⁵|(0⁸)¹⁷, (is_0_at_4|X⁷)³¹|0⁸, (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP3 + OR + // (~has_prefix'|X⁷)³², (is_0_at_4|X⁷)³¹|0⁸, (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + SWAP2 + OR + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000 + OR + // stack: (in_range|X⁷)³², (~has_prefix'|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + + // Compute in_range' and ~has_prefix as + // - in_range' = in_range and has_prefix' = ~has_prefix OR is_0_at_5, for bytes in positions 1-7 and 16-23 + // - in_range' = in_range AND (has_prefix => is_0_at_5 |X⁷)³² and has_prefix' = ~has_prefix, for the rest. 
+ + DUP3 %shl_const(4) + NOT + // stack: (is_0_at_5|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP1 + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFFFFFFFFFFFFFF0000000000000000FFFFFFFFFFFFFFFF000000000000000000 + AND + // stack: (is_0_at_5|X⁷)⁷|(0⁸)⁸|(is_0_at_5|X⁷)⁸|(0⁸)⁸, (is_0_at_5|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP4 + OR + // stack: (~has_prefix'|X⁷)³², (is_0_at_5|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + SWAP3 + OR + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFFFFFFFFFFFFFF0000000000000000FFFFFFFFFFFFFFFF000000000000000000 + OR + AND + // stack: (in_range'|X⁷)³², (~has_prefix'|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + + // Compute in_range' and ~has_prefix' as + // - in_range' = in_range and ~has_prefix' = ~has_prefix OR is_0_at_6, for bytes in positions 1-3, 8-11, 16-19, and 24-27 + // - in_range' = in_range AND (has_prefix => is_0_at_6 |X⁷)³² and ~has_prefix' = has_prefix, for the rest. 
+ DUP3 %shl_const(5) + NOT + // stack: (is_0_at_6|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP1 + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFFFFFF00000000FFFFFFFF00000000FFFFFFFF00000000FFFFFFFF0000000000 + AND + // stack: (is_0_at_6|X⁷)³|(0⁸)⁴|((is_0_at_6|X⁷)⁴|(0⁸)⁴)³, (is_0_at_6|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP4 + OR + // stack: (~has_prefix'|X⁷)³², (is_0_at_6|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + SWAP3 + OR + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFFFFFF00000000FFFFFFFF00000000FFFFFFFF00000000FFFFFFFF0000000000 + OR + AND + // stack: (in_range'|X⁷)³², (~has_prefix'|X⁷)³², (in_range|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + + // Compute in_range' and ~has_prefix' as + // - in_range' = in_range and ~has_prefix' = has_prefix OR is_0_at_7, for bytes in 1, 4-5, 8-9, 12-13, 16-17, 20-21, 24-25, 28-29 + // - in_range' = in_range AND (has_prefix => is_0_at_7 |X⁷)³² and ~has_prefix' = ~has_prefix, for the rest. 
+ DUP3 %shl_const(6) + NOT + // stack: (is_0_at_7|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP1 + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFF0000FFFF0000FFFF0000FFFF0000FFFF0000FFFF0000FFFF0000FFFF000000 + AND + // stack: is_0_at_7|X⁷|(0⁸)²|((is_0_at_7|X⁷)²|(0⁸)²)⁷, (is_0_at_7|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP4 + OR + // (~has_prefix'|X⁷)³², (is_0_at_7|X⁷)³², (in_range|X⁷)³², (~has_prefix|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + SWAP3 + OR + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0xFF0000FFFF0000FFFF0000FFFF0000FFFF0000FFFF0000FFFF0000FFFF000000 + OR + AND + // stack: (in_range'|X⁷)³², (~has_prefix'|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + + // Compute in_range' as + // - in_range' = in_range, for odd positions + // - in_range' = in_range AND (has_prefix => is_0_at_8 |X⁷)³², for the rest + SWAP1 - %increment // n = opcode - 0x60 + 1 - // stack: n, is_push_opcode - MUL - // stack: bytes_to_skip + // stack: (~has_prefix|X⁷)³², (in_range|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + DUP3 %shl_const(7) + NOT + // stack: (is_0_at_8|X⁷)³², (~has_prefix|X⁷)³², (in_range|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + OR + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0x00FF00FF00FF00FF00FF00FF00FF00FF00FF00FF00FF00FF00FF00FF00FF00FF + OR + AND + // stack: (in_range|X⁷)³², packed_opcodes, proof_prefix_addr, ctx, jumpdest, retdest + + // Get rid of the irrelevant bits + // pos 0102030405060708091011121314151617181920212223242526272829303132 + PUSH 0x8080808080808080808080808080808080808080808080808080808080808080 + AND + %jump_neq_const(0x8080808080808080808080808080808080808080808080808080808080808080, return_pop_opcode) + POP + 
%add_const(32) + + // check the remaining path + %jump(verify_path_and_write_jumpdest_table) +return_pop_opcode: + POP +return: + // stack: proof_prefix_addr, ctx, jumpdest, retdest + // or + // stack: jumpdest, ctx, proof_prefix_addr, retdest + %pop3 + JUMP + +%macro write_table_if_jumpdest + %stack (proof_prefix_addr, jumpdest, ctx) -> (proof_prefix_addr, jumpdest, ctx, %%after) + %jump(write_table_if_jumpdest) +%%after: +%endmacro + +// Write the jumpdest table. This is done by +// non-deterministically guessing the sequence of jumpdest +// addresses used during program execution within the current context. +// For each jumpdest address we also non-deterministically guess +// a proof, which is another address in the code such that +// is_jumpdest doesn't abort, when the proof is at the top of the stack +// and the jumpdest address below. If that's the case we set the +// corresponding bit in @SEGMENT_JUMPDEST_BITS to 1. +// +// stack: ctx, code_len, retdest +// stack: (empty) +global jumpdest_analysis: + // If address > 0 then address is interpreted as address' + 1 + // and the next prover input should contain a proof for address'. + PROVER_INPUT(jumpdest_table::next_address) + DUP1 %jumpi(check_proof) + // If address == 0 there are no more jump destinations to check + POP +// This is just a hook used for avoiding verification of the jumpdest +// table in another context. It is useful during proof generation, +// allowing the avoidance of table verification when simulating user code.
+global jumpdest_analysis_end: + %pop2 + JUMP +check_proof: + // stack: address, ctx, code_len, retdest + DUP3 DUP2 %assert_le + %decrement + // stack: proof, ctx, code_len, retdest + DUP2 SWAP1 + // stack: address, ctx, ctx, code_len, retdest + // We read the proof + PROVER_INPUT(jumpdest_table::next_proof) + // stack: proof, address, ctx, ctx, code_len, retdest + %write_table_if_jumpdest + // stack: ctx, code_len, retdest + + %jump(jumpdest_analysis) + +%macro jumpdest_analysis + %stack (ctx, code_len) -> (ctx, code_len, %%after) + %jump(jumpdest_analysis) +%%after: %endmacro diff --git a/evm/src/cpu/kernel/asm/core/log.asm b/evm/src/cpu/kernel/asm/core/log.asm index 0689d49211..f23d5e174c 100644 --- a/evm/src/cpu/kernel/asm/core/log.asm +++ b/evm/src/cpu/kernel/asm/core/log.asm @@ -206,22 +206,27 @@ log_after_topics: // stack: next_log_ptr, data_ptr, data_offset, retdest SWAP1 // stack: data_ptr, next_log_ptr, data_offset, retdest + SWAP2 + PUSH @SEGMENT_MAIN_MEMORY GET_CONTEXT %build_address + SWAP2 + // stack: data_ptr, next_log_ptr, data_addr, retdest + store_log_data_loop: - // stack: cur_data_ptr, next_log_ptr, cur_data_offset, retdest + // stack: cur_data_ptr, next_log_ptr, cur_data_addr, retdest DUP2 DUP2 EQ - // stack: cur_data_ptr == next_log_ptr, cur_data_ptr, next_log_ptr, cur_data_offset, retdest + // stack: cur_data_ptr == next_log_ptr, cur_data_ptr, next_log_ptr, cur_data_addr, retdest %jumpi(store_log_data_loop_end) - // stack: cur_data_ptr, next_log_ptr, cur_data_offset, retdest + // stack: cur_data_ptr, next_log_ptr, cur_data_addr, retdest DUP3 - %mload_current(@SEGMENT_MAIN_MEMORY) - // stack: cur_data, cur_data_ptr, next_log_ptr, cur_data_offset, retdest + MLOAD_GENERAL + // stack: cur_data, cur_data_ptr, next_log_ptr, cur_data_addr, retdest // Store current data byte. 
DUP2 %mstore_kernel(@SEGMENT_LOGS_DATA) - // stack: cur_data_ptr, next_log_ptr, cur_data_offset, retdest + // stack: cur_data_ptr, next_log_ptr, cur_data_addr, retdest SWAP2 %increment SWAP2 - // stack: cur_data_ptr, next_log_ptr, next_data_offset, retdest + // stack: cur_data_ptr, next_log_ptr, next_data_addr, retdest %increment %jump(store_log_data_loop) diff --git a/evm/src/cpu/kernel/asm/core/precompiles/blake2_f.asm b/evm/src/cpu/kernel/asm/core/precompiles/blake2_f.asm index 01c027156f..91d4b3960f 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/blake2_f.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/blake2_f.asm @@ -29,7 +29,8 @@ global precompile_blake2_f: // stack: flag_addr, flag_addr, blake2_f_contd, kexit_info PUSH @SEGMENT_CALLDATA GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, flag_addr, flag_addr, blake2_f_contd, kexit_info + %build_address + // stack: addr, flag_addr, blake2_f_contd, kexit_info MLOAD_GENERAL // stack: flag, flag_addr, blake2_f_contd, kexit_info DUP1 @@ -45,6 +46,7 @@ global precompile_blake2_f: // stack: @SEGMENT_CALLDATA, t1_addr, t1_addr, flag, blake2_f_contd, kexit_info GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, t1_addr, t1_addr, flag, blake2_f_contd, kexit_info + %build_address %mload_packing_u64_LE // stack: t_1, t1_addr, flag, blake2_f_contd, kexit_info SWAP1 @@ -56,6 +58,7 @@ global precompile_blake2_f: // stack: @SEGMENT_CALLDATA, t0_addr, t0_addr, t_1, flag, blake2_f_contd, kexit_info GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, t0_addr, t0_addr, t_1, flag, blake2_f_contd, kexit_info + %build_address %mload_packing_u64_LE // stack: t_0, t0_addr, t_1, flag, blake2_f_contd, kexit_info SWAP1 @@ -71,6 +74,7 @@ global precompile_blake2_f: // stack: @SEGMENT_CALLDATA, m0_addr + 8 * (16 - i - 1), m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, m0_addr + 8 * (16 - i - 1), m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, 
flag, blake2_f_contd, kexit_info + %build_address %mload_packing_u64_LE // stack: m_i, m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info SWAP1 @@ -88,6 +92,7 @@ global precompile_blake2_f: // stack: @SEGMENT_CALLDATA, h0_addr + 8 * (8 - i), h0_addr + 8 * (8 - i), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, h0_addr + 8 * (8 - i), h0_addr + 8 * (8 - i), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + %build_address %mload_packing_u64_LE // stack: h_i, h0_addr + 8 * (8 - i), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info SWAP1 @@ -96,10 +101,11 @@ global precompile_blake2_f: // stack: h0_addr + 8 * 8 = 68, h_0, ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info POP - %stack () -> (@SEGMENT_CALLDATA, 0, 4) + %stack () -> (@SEGMENT_CALLDATA, 4) GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 0, 4, h_0..h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - %mload_packing + // stack: ctx, @SEGMENT_CALLDATA, 4, h_0..h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + %build_address_no_offset + MLOAD_32BYTES // stack: rounds, h_0..h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info DUP1 @@ -113,20 +119,20 @@ blake2_f_contd: // Store the result hash to the parent's return data using `mstore_unpacking_u64_LE`. 
%mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 64) - PUSH 0 - // stack: addr_0=0, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', kexit_info + // stack: h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', kexit_info + PUSH @SEGMENT_RETURNDATA %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - // stack: parent_ctx, addr_0=0, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', kexit_info + // stack: parent_ctx, segment, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', kexit_info + %build_address_no_offset + // stack: addr0=0, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', kexit_info %rep 8 - // stack: parent_ctx, addr_i, h_i', ..., h_7', kexit_info - %stack (ctx, addr, h_i) -> (ctx, @SEGMENT_RETURNDATA, addr, h_i, addr, ctx) + // stack: addri, h_i', ..., h_7', kexit_info + %stack (addr, h_i) -> (addr, h_i, addr) %mstore_unpacking_u64_LE - // stack: addr_i, parent_ctx, h_(i+1)', ..., h_7', kexit_info + // stack: addr_i, h_(i+1)', ..., h_7', kexit_info %add_const(8) - // stack: addr_(i+1), parent_ctx, h_(i+1)', ..., h_7', kexit_info - SWAP1 - // stack: parent_ctx, addr_(i+1), h_(i+1)', ..., h_7', kexit_info + // stack: addr_(i+1), h_(i+1)', ..., h_7', kexit_info %endrep // stack: kexit_info diff --git a/evm/src/cpu/kernel/asm/core/precompiles/bn_add.asm b/evm/src/cpu/kernel/asm/core/precompiles/bn_add.asm index 1dafbe8a43..9554044eff 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/bn_add.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/bn_add.asm @@ -14,28 +14,32 @@ global precompile_bn_add: %charge_gas_const(@BN_ADD_GAS) - // Load x0, y0, x1, y1 from the call data using `mload_packing`. + // Load x0, y0, x1, y1 from the call data using `MLOAD_32BYTES`. 
PUSH bn_add_return // stack: bn_add_return, kexit_info %stack () -> (@SEGMENT_CALLDATA, 96, 32) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 96, 32, bn_add_return, kexit_info - %mload_packing + %build_address + MLOAD_32BYTES // stack: y1, bn_add_return, kexit_info %stack () -> (@SEGMENT_CALLDATA, 64, 32) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 64, 32, y1, bn_add_return, kexit_info - %mload_packing + %build_address + MLOAD_32BYTES // stack: x1, y1, bn_add_return, kexit_info %stack () -> (@SEGMENT_CALLDATA, 32, 32) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 32, 32, x1, y1, bn_add_return, kexit_info - %mload_packing + %build_address + MLOAD_32BYTES // stack: y0, x1, y1, bn_add_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 0, 32) + %stack () -> (@SEGMENT_CALLDATA, 32) GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 0, 32, y0, x1, y1, bn_add_return, kexit_info - %mload_packing + // stack: ctx, @SEGMENT_CALLDATA, 32, y0, x1, y1, bn_add_return, kexit_info + %build_address_no_offset + MLOAD_32BYTES // stack: x0, y0, x1, y1, bn_add_return, kexit_info %jump(bn_add) bn_add_return: @@ -49,9 +53,11 @@ bn_add_return: // Store the result (x, y) to the parent's return data using `mstore_unpacking`. 
%mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 64) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, x, y) -> (parent_ctx, @SEGMENT_RETURNDATA, 0, x, 32, bn_add_contd6, parent_ctx, y) - %jump(mstore_unpacking) -bn_add_contd6: + %stack (parent_ctx, x, y) -> (parent_ctx, @SEGMENT_RETURNDATA, x, parent_ctx, y) + %build_address_no_offset + MSTORE_32BYTES_32 POP - %stack (parent_ctx, y) -> (parent_ctx, @SEGMENT_RETURNDATA, 32, y, 32, pop_and_return_success) - %jump(mstore_unpacking) + %stack (parent_ctx, y) -> (parent_ctx, @SEGMENT_RETURNDATA, 32, y) + %build_address + MSTORE_32BYTES_32 + %jump(pop_and_return_success) diff --git a/evm/src/cpu/kernel/asm/core/precompiles/bn_mul.asm b/evm/src/cpu/kernel/asm/core/precompiles/bn_mul.asm index b3865506d8..5872e17f26 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/bn_mul.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/bn_mul.asm @@ -14,23 +14,26 @@ global precompile_bn_mul: %charge_gas_const(@BN_MUL_GAS) - // Load x, y, n from the call data using `mload_packing`. + // Load x, y, n from the call data using `MLOAD_32BYTES`. 
PUSH bn_mul_return // stack: bn_mul_return, kexit_info %stack () -> (@SEGMENT_CALLDATA, 64, 32) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 64, 32, bn_mul_return, kexit_info - %mload_packing + %build_address + MLOAD_32BYTES // stack: n, bn_mul_return, kexit_info %stack () -> (@SEGMENT_CALLDATA, 32, 32) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 32, 32, n, bn_mul_return, kexit_info - %mload_packing + %build_address + MLOAD_32BYTES // stack: y, n, bn_mul_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 0, 32) + %stack () -> (@SEGMENT_CALLDATA, 32) GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 0, 32, y, n, bn_mul_return, kexit_info - %mload_packing + // stack: ctx, @SEGMENT_CALLDATA, 32, y, n, bn_mul_return, kexit_info + %build_address_no_offset + MLOAD_32BYTES // stack: x, y, n, bn_mul_return, kexit_info %jump(bn_mul) bn_mul_return: @@ -44,9 +47,12 @@ bn_mul_return: // Store the result (Px, Py) to the parent's return data using `mstore_unpacking`. %mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 64) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, Px, Py) -> (parent_ctx, @SEGMENT_RETURNDATA, 0, Px, 32, bn_mul_contd6, parent_ctx, Py) - %jump(mstore_unpacking) + %stack (parent_ctx, Px, Py) -> (parent_ctx, @SEGMENT_RETURNDATA, Px, parent_ctx, Py) + %build_address_no_offset + MSTORE_32BYTES_32 bn_mul_contd6: POP - %stack (parent_ctx, Py) -> (parent_ctx, @SEGMENT_RETURNDATA, 32, Py, 32, pop_and_return_success) - %jump(mstore_unpacking) + %stack (parent_ctx, Py) -> (parent_ctx, @SEGMENT_RETURNDATA, 32, Py) + %build_address + MSTORE_32BYTES_32 + %jump(pop_and_return_success) diff --git a/evm/src/cpu/kernel/asm/core/precompiles/ecrec.asm b/evm/src/cpu/kernel/asm/core/precompiles/ecrec.asm index b38307c4db..6c141aabc5 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/ecrec.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/ecrec.asm @@ -14,28 +14,32 @@ global precompile_ecrec: %charge_gas_const(@ECREC_GAS) - // Load hash, 
v, r, s from the call data using `mload_packing`. + // Load hash, v, r, s from the call data using `MLOAD_32BYTES`. PUSH ecrec_return // stack: ecrec_return, kexit_info %stack () -> (@SEGMENT_CALLDATA, 96, 32) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 96, 32, ecrec_return, kexit_info - %mload_packing + %build_address + MLOAD_32BYTES // stack: s, ecrec_return, kexit_info %stack () -> (@SEGMENT_CALLDATA, 64, 32) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 64, 32, s, ecrec_return, kexit_info - %mload_packing + %build_address + MLOAD_32BYTES // stack: r, s, ecrec_return, kexit_info %stack () -> (@SEGMENT_CALLDATA, 32, 32) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 32, 32, r, s, ecrec_return, kexit_info - %mload_packing + %build_address + MLOAD_32BYTES // stack: v, r, s, ecrec_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 0, 32) + %stack () -> (@SEGMENT_CALLDATA, 32) GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 0, 32, v, r, s, ecrec_return, kexit_info - %mload_packing + // stack: ctx, @SEGMENT_CALLDATA, 32, v, r, s, ecrec_return, kexit_info + %build_address_no_offset + MLOAD_32BYTES // stack: hash, v, r, s, ecrec_return, kexit_info %jump(ecrecover) ecrec_return: @@ -45,8 +49,10 @@ ecrec_return: // Store the result address to the parent's return data using `mstore_unpacking`. %mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 32) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, address) -> (parent_ctx, @SEGMENT_RETURNDATA, 0, address, 32, pop_and_return_success) - %jump(mstore_unpacking) + %stack (parent_ctx, address) -> (parent_ctx, @SEGMENT_RETURNDATA, address) + %build_address_no_offset + MSTORE_32BYTES_32 + %jump(pop_and_return_success) // On bad input, return empty return data but still return success. 
ecrec_bad_input: diff --git a/evm/src/cpu/kernel/asm/core/precompiles/expmod.asm b/evm/src/cpu/kernel/asm/core/precompiles/expmod.asm index 2185ee2c9d..6bff54ea4e 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/expmod.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/expmod.asm @@ -11,43 +11,43 @@ // We pass around total_num_limbs and len for conveience, because we can't access them from the stack // if they're hidden behind the variable number of limbs. mload_bytes_as_limbs: - // stack: ctx, segment, offset, num_bytes, retdest, total_num_limbs, len, ..limbs - DUP4 - // stack: num_bytes, ctx, segment, offset, num_bytes, retdest, total_num_limbs, len, ..limbs + // stack: addr, num_bytes, retdest, total_num_limbs, len, ..limbs + DUP2 + // stack: num_bytes, addr, num_bytes, retdest, total_num_limbs, len, ..limbs %mod_16 - // stack: min(16, num_bytes), ctx, segment, offset, num_bytes, retdest, total_num_limbs, len, ..limbs - %stack (len, addr: 3) -> (addr, len, addr) - // stack: ctx, segment, offset, min(16, num_bytes), ctx, segment, offset, num_bytes, retdest, total_num_limbs, len, ..limbs - %mload_packing - // stack: new_limb, ctx, segment, offset, num_bytes, retdest, total_num_limbs, len, ..limbs - %stack (new, addr: 3, numb, ret, tot, len) -> (numb, addr, ret, tot, len, new) - // stack: num_bytes, ctx, segment, offset, retdest, total_num_limbs, len, new_limb, ..limbs + // stack: min(16, num_bytes), addr, num_bytes, retdest, total_num_limbs, len, ..limbs + DUP2 + // stack: addr, min(16, num_bytes), addr, num_bytes, retdest, total_num_limbs, len, ..limbs + MLOAD_32BYTES + // stack: new_limb, addr, num_bytes, retdest, total_num_limbs, len, ..limbs + %stack (new, addr, numb, ret, tot, len) -> (numb, addr, ret, tot, len, new) + // stack: num_bytes, addr, retdest, total_num_limbs, len, new_limb, ..limbs DUP1 %mod_16 - // stack: num_bytes%16, num_bytes, ctx, segment, offset, retdest, total_num_limbs, len, new_limb, ..limbs + // stack: num_bytes%16, num_bytes, addr, 
retdest, total_num_limbs, len, new_limb, ..limbs DUP1 SWAP2 SUB - // stack:num_bytes_new, num_bytes%16, ctx, segment, offset, retdest, total_num_limbs, len, new_limb, ..limbs + // stack: num_bytes_new, num_bytes%16, addr, retdest, total_num_limbs, len, new_limb, ..limbs DUP1 ISZERO %jumpi(mload_bytes_return) SWAP1 - // stack: num_bytes%16, num_bytes_new, ctx, segment, offset, retdest, total_num_limbs, len, new_limb, ..limbs - DUP5 // offset - ADD - // stack: offset_new, num_bytes_new, ctx, segment, offset, retdest, total_num_limbs, len, new_limb, ..limbs - SWAP4 POP - // stack: num_bytes_new, ctx, segment, offset_new, retdest, total_num_limbs, len, new_limb, ..limbs - %stack (num, addr: 3) -> (addr, num) + // stack: num_bytes%16, num_bytes_new, addr, retdest, total_num_limbs, len, new_limb, ..limbs + DUP3 // addr + ADD // increment offset + // stack: addr_new, num_bytes_new, addr, retdest, total_num_limbs, len, new_limb, ..limbs + SWAP2 POP + // stack: num_bytes_new, addr_new, retdest, total_num_limbs, len, new_limb, ..limbs + SWAP1 %jump(mload_bytes_as_limbs) mload_bytes_return: - // stack: num_bytes_new, num_bytes%16, ctx, segment, offset, retdest, total_num_limbs, len, new_limb, ..limbs - %pop5 + // stack: num_bytes_new, num_bytes%16, addr, retdest, total_num_limbs, len, new_limb, ..limbs + %pop3 // stack: retdest, total_num_limbs, len, ..limbs JUMP %macro mload_bytes_as_limbs - %stack (ctx, segment, offset, num_bytes, total_num_limbs) -> (ctx, segment, offset, num_bytes, %%after, total_num_limbs) + %stack (addr, num_bytes, total_num_limbs) -> (addr, num_bytes, %%after, total_num_limbs) %jump(mload_bytes_as_limbs) %%after: %endmacro @@ -112,7 +112,8 @@ calculate_l_E_prime: // stack: 96 + l_B, 32, l_E, l_B, retdest PUSH @SEGMENT_CALLDATA GET_CONTEXT - %mload_packing + %build_address + MLOAD_32BYTES // stack: i[96 + l_B..128 + l_B], l_E, l_B, retdest %log2_floor // stack: log2(i[96 + l_B..128 + l_B]), l_E, l_B, retdest @@ -142,7 +143,8 @@ case_le_32: // stack: 96 
+ l_B, l_E, retdest PUSH @SEGMENT_CALLDATA GET_CONTEXT - %mload_packing + %build_address + MLOAD_32BYTES // stack: E, retdest %log2_floor // stack: log2(E), retdest @@ -165,23 +167,26 @@ global precompile_expmod: // stack: kexit_info // Load l_B from i[0..32]. - %stack () -> (@SEGMENT_CALLDATA, 0, 32) - // stack: @SEGMENT_CALLDATA, 0, 32, kexit_info + %stack () -> (@SEGMENT_CALLDATA, 32) + // stack: @SEGMENT_CALLDATA, 32, kexit_info GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 0, 32, kexit_info - %mload_packing + // stack: ctx, @SEGMENT_CALLDATA, 32, kexit_info + %build_address_no_offset + MLOAD_32BYTES // stack: l_B, kexit_info // Load l_E from i[32..64]. %stack () -> (@SEGMENT_CALLDATA, 32, 32) GET_CONTEXT - %mload_packing + %build_address + MLOAD_32BYTES // stack: l_E, l_B, kexit_info // Load l_M from i[64..96]. %stack () -> (@SEGMENT_CALLDATA, 64, 32) GET_CONTEXT - %mload_packing + %build_address + MLOAD_32BYTES // stack: l_M, l_E, l_B, kexit_info DUP3 ISZERO DUP2 ISZERO MUL // AND @@ -247,6 +252,7 @@ l_E_prime_return: %stack () -> (@SEGMENT_CALLDATA, 96) GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 96, num_bytes, num_limbs, len, len, l_M, l_E, l_B, kexit_info + %build_address %mload_bytes_as_limbs // stack: num_limbs, len, limbs[num_limbs-1], .., limbs[0], len, l_M, l_E, l_B, kexit_info SWAP1 @@ -282,6 +288,7 @@ copy_b_end: PUSH @SEGMENT_CALLDATA GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 96 + l_B, num_bytes, num_limbs, len, len, l_M, l_E, l_B, kexit_info + %build_address %mload_bytes_as_limbs // stack: num_limbs, len, limbs[num_limbs-1], .., limbs[0], len, l_M, l_E, l_B, kexit_info SWAP1 @@ -316,6 +323,7 @@ copy_e_end: PUSH @SEGMENT_CALLDATA GET_CONTEXT // stack: ctx, @SEGMENT_CALLDATA, 96 + l_B + l_E, num_bytes, num_limbs, len, len, l_M, l_E, l_B, kexit_info + %build_address %mload_bytes_as_limbs // stack: num_limbs, len, limbs[num_limbs-1], .., limbs[0], len, l_M, l_E, l_B, kexit_info SWAP1 @@ -410,33 +418,33 @@ expmod_contd: DUP2 DUP2 ADD - // 
stack: cur_address=out+l_M_128-1, end_address=out-1, l_M_128, l_M%16, kexit_info + // stack: cur_offset=out+l_M_128-1, end_offset=out-1, l_M_128, l_M%16, kexit_info DUP1 %mload_current_general - %stack (cur_limb, cur_address, end_address, l_M_128, l_M_mod16, kexit_info) -> - (@SEGMENT_RETURNDATA, 0, cur_limb, l_M_mod16, cur_address, end_address, l_M_128, kexit_info) + %stack (cur_limb, cur_offset, end_offset, l_M_128, l_M_mod16, kexit_info) -> + (@SEGMENT_RETURNDATA, cur_limb, l_M_mod16, cur_offset, end_offset, l_M_128, kexit_info) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) + %build_address_no_offset %mstore_unpacking - // stack: offset, cur_address, end_address, l_M_128, kexit_info + // stack: address, cur_offset, end_offset, l_M_128, kexit_info SWAP1 %decrement - // stack: cur_address, offset, end_address, l_M_128, kexit_info + // stack: cur_offset, address, end_offset, l_M_128, kexit_info // Store in big-endian format. expmod_store_loop: - // stack: cur_address, offset, end_address, l_M_128, kexit_info + // stack: cur_offset, address, end_offset, l_M_128, kexit_info DUP3 DUP2 EQ %jumpi(expmod_store_end) - // stack: cur_address, offset, end_address, l_M_128, kexit_info + // stack: cur_offset, address, end_offset, l_M_128, kexit_info DUP1 %mload_current_general - %stack (cur_limb, cur_address, offset, end_address, l_M_128, kexit_info) -> - (offset, cur_limb, cur_address, end_address, l_M_128, kexit_info) - %stack (offset, cur_limb) -> (@SEGMENT_RETURNDATA, offset, cur_limb, 16) - %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) + %stack (cur_limb, cur_offset, address, end_offset, l_M_128, kexit_info) -> + (address, cur_limb, cur_offset, end_offset, l_M_128, kexit_info) + %stack (address, cur_limb) -> (address, cur_limb, 16) %mstore_unpacking - // stack: offset', cur_address, end_address, l_M_128, kexit_info) + // stack: address', cur_offset, end_offset, l_M_128, kexit_info) SWAP1 %decrement - // stack: cur_address-1, offset', end_address, l_M_128, 
kexit_info) + // stack: cur_offset-1, address', end_offset, l_M_128, kexit_info) %jump(expmod_store_loop) expmod_store_end: - // stack: cur_address, offset, end_address, l_M_128, kexit_info + // stack: cur_offset, address, end_offset, l_M_128, kexit_info %pop4 the_end: // stack: kexit_info diff --git a/evm/src/cpu/kernel/asm/core/precompiles/id.asm b/evm/src/cpu/kernel/asm/core/precompiles/id.asm index 0aa0894fd0..a606ef4a85 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/id.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/id.asm @@ -24,15 +24,20 @@ global precompile_id: // Simply copy the call data to the parent's return data. %calldatasize DUP1 %mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE) + + PUSH id_contd SWAP1 + + PUSH @SEGMENT_CALLDATA GET_CONTEXT + %build_address_no_offset + // stack: SRC, size, id_contd + + PUSH @SEGMENT_RETURNDATA %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, ctx, size) -> - ( - parent_ctx, @SEGMENT_RETURNDATA, 0, // DST - ctx, @SEGMENT_CALLDATA, 0, // SRC - size, id_contd // count, retdest - ) - %jump(memcpy) + %build_address_no_offset + + // stack: DST, SRC, size, id_contd + %jump(memcpy_bytes) id_contd: // stack: kexit_info diff --git a/evm/src/cpu/kernel/asm/core/precompiles/main.asm b/evm/src/cpu/kernel/asm/core/precompiles/main.asm index b45b46cb0d..b7c916e9c4 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/main.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/main.asm @@ -58,8 +58,10 @@ global handle_precompiles_from_eoa: %mload_txn_field(@TXN_FIELD_DATA_LEN) %stack (calldata_size, new_ctx) -> (calldata_size, new_ctx, calldata_size) %set_new_ctx_calldata_size - %stack (new_ctx, calldata_size) -> (new_ctx, @SEGMENT_CALLDATA, 0, 0, @SEGMENT_TXN_DATA, 0, calldata_size, handle_precompiles_from_eoa_finish, new_ctx) - %jump(memcpy) + %stack (new_ctx, calldata_size) -> (@SEGMENT_TXN_DATA, @SEGMENT_CALLDATA, new_ctx, calldata_size, handle_precompiles_from_eoa_finish, new_ctx) + 
SWAP2 %build_address_no_offset // DST + // stack: DST, SRC, calldata_size, handle_precompiles_from_eoa_finish, new_ctx + %jump(memcpy_bytes) handle_precompiles_from_eoa_finish: %stack (new_ctx, addr, retdest) -> (addr, new_ctx, retdest) diff --git a/evm/src/cpu/kernel/asm/core/precompiles/rip160.asm b/evm/src/cpu/kernel/asm/core/precompiles/rip160.asm index 20ea42cb58..e57504961b 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/rip160.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/rip160.asm @@ -25,34 +25,26 @@ global precompile_rip160: %calldatasize GET_CONTEXT - // The next block of code is equivalent to the following %stack macro call - // (unfortunately the macro call takes too long to expand dynamically). - // - // %stack (ctx, size) -> - // ( - // ctx, @SEGMENT_KERNEL_GENERAL, 200, // DST - // ctx, @SEGMENT_CALLDATA, 0, // SRC - // size, ripemd, // count, retdest - // 200, size, rip160_contd // ripemd input: virt, num_bytes, retdest - // ) - PUSH 200 - PUSH ripemd - DUP4 - PUSH 0 - PUSH @SEGMENT_CALLDATA - PUSH rip160_contd - SWAP7 - SWAP6 - PUSH 200 - PUSH @SEGMENT_KERNEL_GENERAL - DUP3 + %stack (ctx, size) -> + ( + ctx, @SEGMENT_CALLDATA, // SRC + ctx, + size, ripemd, // count, retdest + 200, size, rip160_contd // ripemd input: virt, num_bytes, retdest + ) + %build_address_no_offset + %stack(addr, ctx) -> (ctx, @SEGMENT_KERNEL_GENERAL, 200, addr) + %build_address + // stack: DST, SRC, count, retdest, virt, num_bytes, retdest - %jump(memcpy) + %jump(memcpy_bytes) rip160_contd: // stack: hash, kexit_info // Store the result hash to the parent's return data using `mstore_unpacking`. 
%mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 32) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, hash) -> (parent_ctx, @SEGMENT_RETURNDATA, 0, hash, 32, pop_and_return_success) - %jump(mstore_unpacking) + %stack (parent_ctx, hash) -> (parent_ctx, @SEGMENT_RETURNDATA, hash) + %build_address_no_offset + MSTORE_32BYTES_32 + %jump(pop_and_return_success) diff --git a/evm/src/cpu/kernel/asm/core/precompiles/sha256.asm b/evm/src/cpu/kernel/asm/core/precompiles/sha256.asm index 97cf0f026f..3c926f0bbd 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/sha256.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/sha256.asm @@ -24,37 +24,27 @@ global precompile_sha256: // Copy the call data to the kernel general segment (sha2 expects it there) and call sha2. %calldatasize GET_CONTEXT - // stack: ctx, size - // The next block of code is equivalent to the following %stack macro call - // (unfortunately the macro call takes too long to expand dynamically). - // - // %stack (ctx, size) -> - // ( - // ctx, @SEGMENT_KERNEL_GENERAL, 1, // DST - // ctx, @SEGMENT_CALLDATA, 0, // SRC - // size, sha2, // count, retdest - // 0, size, sha256_contd // sha2 input: virt, num_bytes, retdest - // ) - // - PUSH 0 - PUSH sha2 - DUP4 - PUSH 0 - PUSH @SEGMENT_CALLDATA - PUSH sha256_contd - SWAP7 - SWAP6 - PUSH 1 - PUSH @SEGMENT_KERNEL_GENERAL - DUP3 + %stack (ctx, size) -> + ( + ctx, @SEGMENT_CALLDATA, // SRC + ctx, + size, sha2, // count, retdest + 0, size, sha256_contd // sha2 input: virt, num_bytes, retdest + ) + %build_address_no_offset + %stack(addr, ctx) -> (ctx, @SEGMENT_KERNEL_GENERAL, 1, addr) + %build_address + // stack: DST, SRC, count, retdest, virt, num_bytes, retdest - %jump(memcpy) + %jump(memcpy_bytes) sha256_contd: // stack: hash, kexit_info // Store the result hash to the parent's return data using `mstore_unpacking`. 
%mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 32) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, hash) -> (parent_ctx, @SEGMENT_RETURNDATA, 0, hash, 32, pop_and_return_success) - %jump(mstore_unpacking) + %stack (parent_ctx, hash) -> (parent_ctx, @SEGMENT_RETURNDATA, hash) + %build_address_no_offset + MSTORE_32BYTES_32 + %jump(pop_and_return_success) diff --git a/evm/src/cpu/kernel/asm/core/precompiles/snarkv.asm b/evm/src/cpu/kernel/asm/core/precompiles/snarkv.asm index f128cd51ad..23ad9eb17d 100644 --- a/evm/src/cpu/kernel/asm/core/precompiles/snarkv.asm +++ b/evm/src/cpu/kernel/asm/core/precompiles/snarkv.asm @@ -30,41 +30,47 @@ loading_loop: DUP1 %mul_const(192) // stack: px, i, k, kexit_info GET_CONTEXT - %stack (ctx, px) -> (ctx, @SEGMENT_CALLDATA, px, 32, loading_loop_contd, px) - %jump(mload_packing) + %stack (ctx, px) -> (ctx, @SEGMENT_CALLDATA, px, 32, px) + %build_address + MLOAD_32BYTES loading_loop_contd: // stack: x, px, i, k, kexit_info SWAP1 %add_const(32) GET_CONTEXT - %stack (ctx, py) -> (ctx, @SEGMENT_CALLDATA, py, 32, loading_loop_contd2, py) - %jump(mload_packing) + %stack (ctx, py) -> (ctx, @SEGMENT_CALLDATA, py, 32, py) + %build_address + MLOAD_32BYTES loading_loop_contd2: // stack: y, py, x, i, k, kexit_info SWAP1 %add_const(32) GET_CONTEXT - %stack (ctx, px_im) -> (ctx, @SEGMENT_CALLDATA, px_im, 32, loading_loop_contd3, px_im) - %jump(mload_packing) + %stack (ctx, px_im) -> (ctx, @SEGMENT_CALLDATA, px_im, 32, px_im) + %build_address + MLOAD_32BYTES loading_loop_contd3: // stack: x_im, px_im, y, x, i, k, kexit_info SWAP1 %add_const(32) // stack: px_re, x_im, y, x, i, k, kexit_info GET_CONTEXT - %stack (ctx, px_re) -> (ctx, @SEGMENT_CALLDATA, px_re, 32, loading_loop_contd4, px_re) - %jump(mload_packing) + %stack (ctx, px_re) -> (ctx, @SEGMENT_CALLDATA, px_re, 32, px_re) + %build_address + MLOAD_32BYTES loading_loop_contd4: // stack: x_re, px_re, x_im, y, x, i, k, kexit_info SWAP1 %add_const(32) // 
stack: py_im, x_re, x_im, y, x, i, k, kexit_info GET_CONTEXT - %stack (ctx, py_im) -> (ctx, @SEGMENT_CALLDATA, py_im, 32, loading_loop_contd5, py_im) - %jump(mload_packing) + %stack (ctx, py_im) -> (ctx, @SEGMENT_CALLDATA, py_im, 32, py_im) + %build_address + MLOAD_32BYTES loading_loop_contd5: // stack: y_im, py_im, x_re, x_im, y, x, i, k, kexit_info SWAP1 %add_const(32) // stack: py_re, y_im, x_re, x_im, y, x, i, k, kexit_info GET_CONTEXT - %stack (ctx, py_re) -> (ctx, @SEGMENT_CALLDATA, py_re, 32, loading_loop_contd6) - %jump(mload_packing) + %stack (ctx, py_re) -> (ctx, @SEGMENT_CALLDATA, py_re, 32) + %build_address + MLOAD_32BYTES loading_loop_contd6: // stack: y_re, y_im, x_re, x_im, y, x, i, k, kexit_info SWAP1 // the EVM serializes the imaginary part first @@ -118,5 +124,7 @@ got_result: // Store the result bool (repr. by a U256) to the parent's return data using `mstore_unpacking`. %mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 32) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, address) -> (parent_ctx, @SEGMENT_RETURNDATA, 0, address, 32, pop_and_return_success) - %jump(mstore_unpacking) + %stack (parent_ctx, address) -> (parent_ctx, @SEGMENT_RETURNDATA, address) + %build_address_no_offset + MSTORE_32BYTES_32 + %jump(pop_and_return_success) diff --git a/evm/src/cpu/kernel/asm/core/process_txn.asm b/evm/src/cpu/kernel/asm/core/process_txn.asm index 779acab1da..c70287a6f9 100644 --- a/evm/src/cpu/kernel/asm/core/process_txn.asm +++ b/evm/src/cpu/kernel/asm/core/process_txn.asm @@ -12,11 +12,11 @@ global process_normalized_txn: // Compute this transaction's intrinsic gas and store it. %intrinsic_gas + DUP1 %mstore_txn_field(@TXN_FIELD_INTRINSIC_GAS) - // stack: retdest + // stack: intrinsic_gas, retdest // Assert gas_limit >= intrinsic_gas. 
- %mload_txn_field(@TXN_FIELD_INTRINSIC_GAS) %mload_txn_field(@TXN_FIELD_GAS_LIMIT) %assert_ge(invalid_txn) @@ -145,25 +145,22 @@ global process_contract_creation_txn: // stack: new_ctx, address, retdest // Store constructor code length - %mload_txn_field(@TXN_FIELD_DATA_LEN) - // stack: data_len, new_ctx, address, retdest PUSH @CTX_METADATA_CODE_SIZE - PUSH @SEGMENT_CONTEXT_METADATA - // stack: segment, offset, data_len, new_ctx, address, retdest - DUP4 // new_ctx + // stack: offset, new_ctx, address, retdest + DUP2 // new_ctx + ADD // CTX_METADATA_CODE_SIZE is already scaled by its segment + // stack: addr, new_ctx, address, retdest + %mload_txn_field(@TXN_FIELD_DATA_LEN) + // stack: data_len, addr, new_ctx, address, retdest MSTORE_GENERAL // stack: new_ctx, address, retdest // Copy the code from txdata to the new context's code segment. PUSH process_contract_creation_txn_after_code_loaded %mload_txn_field(@TXN_FIELD_DATA_LEN) - PUSH 0 // SRC.offset - PUSH @SEGMENT_TXN_DATA // SRC.segment - PUSH 0 // SRC.context - PUSH 0 // DST.offset - PUSH @SEGMENT_CODE // DST.segment - DUP8 // DST.context = new_ctx - %jump(memcpy) + PUSH @SEGMENT_TXN_DATA // SRC (context == offset == 0) + DUP4 // DST (segment == 0 (i.e. CODE), and offset == 0) + %jump(memcpy_bytes) global process_contract_creation_txn_after_code_loaded: // stack: new_ctx, address, retdest @@ -203,9 +200,11 @@ global process_contract_creation_txn_after_constructor: // Store the code hash of the new contract. 
// stack: leftover_gas, new_ctx, address, retdest, success - GET_CONTEXT %returndatasize - %stack (size, ctx) -> (ctx, @SEGMENT_RETURNDATA, 0, size) // context, segment, offset, len + PUSH @SEGMENT_RETURNDATA + GET_CONTEXT + %build_address_no_offset + // stack: addr, len KECCAK_GENERAL // stack: codehash, leftover_gas, new_ctx, address, retdest, success %observe_new_contract @@ -248,11 +247,10 @@ global process_message_txn: %create_context // stack: new_ctx, retdest PUSH process_message_txn_code_loaded - PUSH @SEGMENT_CODE - DUP3 // new_ctx + DUP2 // new_ctx %mload_txn_field(@TXN_FIELD_TO) - // stack: address, new_ctx, segment, process_message_txn_code_loaded, new_ctx, retdest - %jump(load_code) + // stack: address, new_ctx, process_message_txn_code_loaded, new_ctx, retdest + %jump(load_code_padded) global process_message_txn_insufficient_balance: // stack: retdest @@ -293,8 +291,9 @@ global process_message_txn_code_loaded: %mload_txn_field(@TXN_FIELD_DATA_LEN) %stack (calldata_size, new_ctx, retdest) -> (calldata_size, new_ctx, calldata_size, retdest) %set_new_ctx_calldata_size - %stack (new_ctx, calldata_size, retdest) -> (new_ctx, @SEGMENT_CALLDATA, 0, 0, @SEGMENT_TXN_DATA, 0, calldata_size, process_message_txn_code_loaded_finish, new_ctx, retdest) - %jump(memcpy) + %stack (new_ctx, calldata_size, retdest) -> (new_ctx, @SEGMENT_CALLDATA, @SEGMENT_TXN_DATA, calldata_size, process_message_txn_code_loaded_finish, new_ctx, retdest) + %build_address_no_offset // DST + %jump(memcpy_bytes) process_message_txn_code_loaded_finish: %enter_new_ctx @@ -452,22 +451,22 @@ global invalid_txn: POP %mload_txn_field(@TXN_FIELD_GAS_LIMIT) PUSH 0 - %jump(txn_loop_after) + %jump(txn_after) global invalid_txn_1: %pop2 %mload_txn_field(@TXN_FIELD_GAS_LIMIT) PUSH 0 - %jump(txn_loop_after) + %jump(txn_after) global invalid_txn_2: %pop3 %mload_txn_field(@TXN_FIELD_GAS_LIMIT) PUSH 0 - %jump(txn_loop_after) + %jump(txn_after) global invalid_txn_3: %pop4 
%mload_txn_field(@TXN_FIELD_GAS_LIMIT) PUSH 0 - %jump(txn_loop_after) + %jump(txn_after) diff --git a/evm/src/cpu/kernel/asm/core/selfdestruct_list.asm b/evm/src/cpu/kernel/asm/core/selfdestruct_list.asm index b3903f71a0..05e158c340 100644 --- a/evm/src/cpu/kernel/asm/core/selfdestruct_list.asm +++ b/evm/src/cpu/kernel/asm/core/selfdestruct_list.asm @@ -5,8 +5,9 @@ %macro insert_selfdestruct_list // stack: addr %mload_global_metadata(@GLOBAL_METADATA_SELFDESTRUCT_LIST_LEN) - %stack (len, addr) -> (len, addr, len) - %mstore_kernel(@SEGMENT_SELFDESTRUCT_LIST) // Store new address at the end of the array. + DUP1 PUSH @SEGMENT_SELFDESTRUCT_LIST %build_kernel_address + %stack (write_addr, len, addr) -> (addr, write_addr, len) + MSTORE_GENERAL // Store new address at the end of the array. // stack: len %increment %mstore_global_metadata(@GLOBAL_METADATA_SELFDESTRUCT_LIST_LEN) // Store new length. @@ -18,12 +19,14 @@ global remove_selfdestruct_list: // stack: addr, retdest %mload_global_metadata(@GLOBAL_METADATA_SELFDESTRUCT_LIST_LEN) // stack: len, addr, retdest - PUSH 0 + PUSH @SEGMENT_SELFDESTRUCT_LIST ADD + PUSH @SEGMENT_SELFDESTRUCT_LIST remove_selfdestruct_list_loop: + // `i` and `len` are both scaled by SEGMENT_SELFDESTRUCT_LIST %stack (i, len, addr, retdest) -> (i, len, i, len, addr, retdest) EQ %jumpi(remove_selfdestruct_not_found) // stack: i, len, addr, retdest - DUP1 %mload_kernel(@SEGMENT_SELFDESTRUCT_LIST) + DUP1 MLOAD_GENERAL // stack: loaded_addr, i, len, addr, retdest DUP4 // stack: addr, loaded_addr, i, len, addr, retdest @@ -33,12 +36,14 @@ remove_selfdestruct_list_loop: %jump(remove_selfdestruct_list_loop) remove_selfdestruct_list_found: %stack (i, len, addr, retdest) -> (len, 1, i, retdest) - SUB DUP1 %mstore_global_metadata(@GLOBAL_METADATA_SELFDESTRUCT_LIST_LEN) // Decrement the list length. + SUB + PUSH @SEGMENT_SELFDESTRUCT_LIST + DUP2 SUB // unscale + %mstore_global_metadata(@GLOBAL_METADATA_SELFDESTRUCT_LIST_LEN) // Decrement the list length. 
// stack: len-1, i, retdest - %mload_kernel(@SEGMENT_SELFDESTRUCT_LIST) // Load the last address in the list. + MLOAD_GENERAL // Load the last address in the list. // stack: last_addr, i, retdest - SWAP1 - %mstore_kernel(@SEGMENT_SELFDESTRUCT_LIST) // Store the last address at the position of the removed address. + MSTORE_GENERAL // Store the last address at the position of the removed address. JUMP remove_selfdestruct_not_found: // stack: i, len, addr, retdest @@ -49,12 +54,14 @@ global delete_all_selfdestructed_addresses: // stack: retdest %mload_global_metadata(@GLOBAL_METADATA_SELFDESTRUCT_LIST_LEN) // stack: len, retdest - PUSH 0 + PUSH @SEGMENT_SELFDESTRUCT_LIST ADD + PUSH @SEGMENT_SELFDESTRUCT_LIST delete_all_selfdestructed_addresses_loop: + // `i` and `len` are both scaled by SEGMENT_SELFDESTRUCT_LIST // stack: i, len, retdest DUP2 DUP2 EQ %jumpi(delete_all_selfdestructed_addresses_done) // stack: i, len, retdest - DUP1 %mload_kernel(@SEGMENT_SELFDESTRUCT_LIST) + DUP1 MLOAD_GENERAL // stack: loaded_addr, i, len, retdest DUP1 %is_non_existent ISZERO %jumpi(bingo) // stack: loaded_addr, i, len, retdest diff --git a/evm/src/cpu/kernel/asm/core/syscall.asm b/evm/src/cpu/kernel/asm/core/syscall.asm index 9bfcb24543..673a5fbb90 100644 --- a/evm/src/cpu/kernel/asm/core/syscall.asm +++ b/evm/src/cpu/kernel/asm/core/syscall.asm @@ -128,14 +128,9 @@ global syscall_jumptable: JUMPTABLE panic // 0xb0-0xbf are invalid opcodes %endrep - // 0xc0-0xcf - %rep 16 - JUMPTABLE panic // 0xc0-0xcf are invalid opcodes - %endrep - - // 0xd0-0xdf - %rep 16 - JUMPTABLE panic // 0xd0-0xdf are invalid opcodes + // 0xc0-0xdf + %rep 32 + JUMPTABLE panic // mstore_32bytes_1-32 are implemented natively %endrep // 0xe0-0xef diff --git a/evm/src/cpu/kernel/asm/core/terminate.asm b/evm/src/cpu/kernel/asm/core/terminate.asm index 6811234a9a..6ae04e9fd6 100644 --- a/evm/src/cpu/kernel/asm/core/terminate.asm +++ b/evm/src/cpu/kernel/asm/core/terminate.asm @@ -6,10 +6,6 @@ global sys_stop: // 
Set the parent context's return data size to 0. %mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 0) - // This makes sure the gas used hasn't overflowed the gaslimit. - // This could happen when executing a native instruction (i.e. not a syscall). - %charge_gas_const(0) - %leftover_gas // stack: leftover_gas PUSH 1 // success @@ -33,19 +29,27 @@ return_after_gas: // Store the return data size in the parent context's metadata. %stack (parent_ctx, kexit_info, offset, size) -> - (parent_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_RETURNDATA_SIZE, size, offset, size, parent_ctx, kexit_info) + (parent_ctx, @CTX_METADATA_RETURNDATA_SIZE, size, offset, size, parent_ctx, kexit_info) + ADD // addr (CTX offsets are already scaled by their segment) + SWAP1 + // stack: size, addr, offset, size, parent_ctx, kexit_info MSTORE_GENERAL // stack: offset, size, parent_ctx, kexit_info // Store the return data in the parent context's returndata segment. + PUSH @SEGMENT_MAIN_MEMORY GET_CONTEXT - %stack (ctx, offset, size, parent_ctx, kexit_info) -> + %build_address + + %stack (addr, size, parent_ctx, kexit_info) -> ( - parent_ctx, @SEGMENT_RETURNDATA, 0, // DST - ctx, @SEGMENT_MAIN_MEMORY, offset, // SRC + parent_ctx, @SEGMENT_RETURNDATA, // DST + addr, // SRC size, sys_return_finish, kexit_info // count, retdest, ... ) - %jump(memcpy) + %build_address_no_offset + // stack: DST, SRC, size, sys_return_finish, kexit_info + %jump(memcpy_bytes) sys_return_finish: // stack: kexit_info @@ -147,19 +151,27 @@ revert_after_gas: // Store the return data size in the parent context's metadata. 
%stack (parent_ctx, kexit_info, offset, size) -> - (parent_ctx, @SEGMENT_CONTEXT_METADATA, @CTX_METADATA_RETURNDATA_SIZE, size, offset, size, parent_ctx, kexit_info) + (parent_ctx, @CTX_METADATA_RETURNDATA_SIZE, size, offset, size, parent_ctx, kexit_info) + ADD // addr (CTX offsets are already scaled by their segment) + SWAP1 + // stack: size, addr, offset, size, parent_ctx, kexit_info MSTORE_GENERAL // stack: offset, size, parent_ctx, kexit_info // Store the return data in the parent context's returndata segment. + PUSH @SEGMENT_MAIN_MEMORY GET_CONTEXT - %stack (ctx, offset, size, parent_ctx, kexit_info) -> + %build_address + + %stack (addr, size, parent_ctx, kexit_info) -> ( - parent_ctx, @SEGMENT_RETURNDATA, 0, // DST - ctx, @SEGMENT_MAIN_MEMORY, offset, // SRC + parent_ctx, @SEGMENT_RETURNDATA, // DST + addr, // SRC size, sys_revert_finish, kexit_info // count, retdest, ... ) - %jump(memcpy) + %build_address_no_offset + // stack: DST, SRC, size, sys_revert_finish, kexit_info + %jump(memcpy_bytes) sys_revert_finish: %leftover_gas @@ -205,9 +217,11 @@ global terminate_common: // Similarly, we write the parent PC to SEGMENT_KERNEL_GENERAL[2] so that // we can later read it after switching to the parent context. - %mload_context_metadata(@CTX_METADATA_PARENT_PC) PUSH 2 - %mstore_kernel(@SEGMENT_KERNEL_GENERAL) + PUSH @SEGMENT_KERNEL_GENERAL + %build_kernel_address + %mload_context_metadata(@CTX_METADATA_PARENT_PC) + MSTORE_GENERAL // stack: (empty) // Go back to the parent context. 
diff --git a/evm/src/cpu/kernel/asm/core/touched_addresses.asm b/evm/src/cpu/kernel/asm/core/touched_addresses.asm index f2c0394a66..d9c70f47ac 100644 --- a/evm/src/cpu/kernel/asm/core/touched_addresses.asm +++ b/evm/src/cpu/kernel/asm/core/touched_addresses.asm @@ -15,12 +15,14 @@ global insert_touched_addresses: // stack: addr, retdest %mload_global_metadata(@GLOBAL_METADATA_TOUCHED_ADDRESSES_LEN) // stack: len, addr, retdest - PUSH 0 + PUSH @SEGMENT_TOUCHED_ADDRESSES ADD + PUSH @SEGMENT_TOUCHED_ADDRESSES insert_touched_addresses_loop: + // `i` and `len` are both scaled by SEGMENT_TOUCHED_ADDRESSES %stack (i, len, addr, retdest) -> (i, len, i, len, addr, retdest) EQ %jumpi(insert_address) // stack: i, len, addr, retdest - DUP1 %mload_kernel(@SEGMENT_TOUCHED_ADDRESSES) + DUP1 MLOAD_GENERAL // stack: loaded_addr, i, len, addr, retdest DUP4 // stack: addr, loaded_addr, i, len, addr, retdest @@ -30,10 +32,11 @@ insert_touched_addresses_loop: %jump(insert_touched_addresses_loop) insert_address: - %stack (i, len, addr, retdest) -> (i, addr, len, retdest) + %stack (i, len, addr, retdest) -> (i, addr, len, @SEGMENT_TOUCHED_ADDRESSES, retdest) DUP2 %journal_add_account_touched // Add a journal entry for the touched account. - %mstore_kernel(@SEGMENT_TOUCHED_ADDRESSES) // Store new address at the end of the array. - // stack: len, retdest + %swap_mstore // Store new address at the end of the array. + // stack: len, segment, retdest + SUB // unscale %increment %mstore_global_metadata(@GLOBAL_METADATA_TOUCHED_ADDRESSES_LEN) // Store new length. 
JUMP @@ -49,12 +52,14 @@ global remove_touched_addresses: // stack: addr, retdest %mload_global_metadata(@GLOBAL_METADATA_TOUCHED_ADDRESSES_LEN) // stack: len, addr, retdest - PUSH 0 + PUSH @SEGMENT_TOUCHED_ADDRESSES ADD + PUSH @SEGMENT_TOUCHED_ADDRESSES remove_touched_addresses_loop: + // `i` and `len` are both scaled by SEGMENT_TOUCHED_ADDRESSES %stack (i, len, addr, retdest) -> (i, len, i, len, addr, retdest) EQ %jumpi(panic) // stack: i, len, addr, retdest - DUP1 %mload_kernel(@SEGMENT_TOUCHED_ADDRESSES) + DUP1 MLOAD_GENERAL // stack: loaded_addr, i, len, addr, retdest DUP4 // stack: addr, loaded_addr, i, len, addr, retdest @@ -64,12 +69,14 @@ remove_touched_addresses_loop: %jump(remove_touched_addresses_loop) remove_touched_addresses_found: %stack (i, len, addr, retdest) -> (len, 1, i, retdest) - SUB DUP1 %mstore_global_metadata(@GLOBAL_METADATA_TOUCHED_ADDRESSES_LEN) // Decrement the list length. + SUB + PUSH @SEGMENT_TOUCHED_ADDRESSES DUP2 + SUB // unscale + %mstore_global_metadata(@GLOBAL_METADATA_TOUCHED_ADDRESSES_LEN) // Decrement the list length. // stack: len-1, i, retdest - %mload_kernel(@SEGMENT_TOUCHED_ADDRESSES) // Load the last address in the list. + MLOAD_GENERAL // Load the last address in the list. // stack: last_addr, i, retdest - SWAP1 - %mstore_kernel(@SEGMENT_TOUCHED_ADDRESSES) // Store the last address at the position of the removed address. + MSTORE_GENERAL // Store the last address at the position of the removed address. 
JUMP @@ -77,12 +84,14 @@ global delete_all_touched_addresses: // stack: retdest %mload_global_metadata(@GLOBAL_METADATA_TOUCHED_ADDRESSES_LEN) // stack: len, retdest - PUSH 0 + PUSH @SEGMENT_TOUCHED_ADDRESSES ADD + PUSH @SEGMENT_TOUCHED_ADDRESSES delete_all_touched_addresses_loop: + // `i` and `len` are both scaled by SEGMENT_TOUCHED_ADDRESSES // stack: i, len, retdest DUP2 DUP2 EQ %jumpi(delete_all_touched_addresses_done) // stack: i, len, retdest - DUP1 %mload_kernel(@SEGMENT_TOUCHED_ADDRESSES) + DUP1 MLOAD_GENERAL // stack: loaded_addr, i, len, retdest DUP1 %is_empty %jumpi(bingo) // stack: loaded_addr, i, len, retdest diff --git a/evm/src/cpu/kernel/asm/core/util.asm b/evm/src/cpu/kernel/asm/core/util.asm index ee33ff26ca..a77329bd8c 100644 --- a/evm/src/cpu/kernel/asm/core/util.asm +++ b/evm/src/cpu/kernel/asm/core/util.asm @@ -11,7 +11,7 @@ %macro next_context_id // stack: (empty) %mload_global_metadata(@GLOBAL_METADATA_LARGEST_CONTEXT) - %increment + %add_const(0x10000000000000000) // scale each context by 2^64 // stack: new_ctx DUP1 %mstore_global_metadata(@GLOBAL_METADATA_LARGEST_CONTEXT) @@ -83,7 +83,6 @@ SET_CONTEXT // stack: (empty) // We can now read this stack length from memory. 
- push @CTX_METADATA_STACK_SIZE - %mload_current(@SEGMENT_CONTEXT_METADATA) + %mload_context_metadata(@CTX_METADATA_STACK_SIZE) // stack: stack_length %endmacro diff --git a/evm/src/cpu/kernel/asm/core/withdrawals.asm b/evm/src/cpu/kernel/asm/core/withdrawals.asm new file mode 100644 index 0000000000..3be05d880c --- /dev/null +++ b/evm/src/cpu/kernel/asm/core/withdrawals.asm @@ -0,0 +1,25 @@ +%macro withdrawals + // stack: (empty) + PUSH %%after + %jump(withdrawals) +%%after: + // stack: (empty) +%endmacro + +global withdrawals: + // stack: retdest + PROVER_INPUT(withdrawal) + // stack: address, retdest + PROVER_INPUT(withdrawal) + // stack: amount, address, retdest + DUP2 %eq_const(@U256_MAX) %jumpi(withdrawals_end) + SWAP1 + // stack: address, amount, retdest + %add_eth + // stack: retdest + %jump(withdrawals) + +withdrawals_end: + // stack: amount, address, retdest + %pop2 + JUMP diff --git a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/curve_mul.asm b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/curve_mul.asm index ecbb3de009..93864c5519 100644 --- a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/curve_mul.asm +++ b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/curve_mul.asm @@ -27,12 +27,12 @@ bn_mul_valid_point: bn_mul_after_glv: // stack: bneg, a, b, x, y, bn_msm, bn_mul_end, retdest // Store bneg at this (otherwise unused) location. Will be used later in the MSM. 
- %mstore_kernel(@SEGMENT_KERNEL_BN_TABLE_Q, @BN_BNEG_LOC) + %mstore_current(@SEGMENT_BN_TABLE_Q, @BN_BNEG_LOC) // stack: a, b, x, y, bn_msm, bn_mul_end, retdest - PUSH bn_mul_after_a SWAP1 PUSH @SEGMENT_KERNEL_BN_WNAF_A PUSH @BN_SCALAR %jump(wnaf) + PUSH bn_mul_after_a SWAP1 PUSH @SEGMENT_BN_WNAF_A PUSH @BN_SCALAR %jump(wnaf) bn_mul_after_a: // stack: b, x, y, bn_msm, bn_mul_end, retdest - PUSH bn_mul_after_b SWAP1 PUSH @SEGMENT_KERNEL_BN_WNAF_B PUSH @BN_SCALAR %jump(wnaf) + PUSH bn_mul_after_b SWAP1 PUSH @SEGMENT_BN_WNAF_B PUSH @BN_SCALAR %jump(wnaf) bn_mul_after_b: // stack: x, y, bn_msm, bn_mul_end, retdest %jump(bn_precompute_table) diff --git a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/final_exponent.asm b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/final_exponent.asm index d1f32ce65a..035cb43830 100644 --- a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/final_exponent.asm +++ b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/final_exponent.asm @@ -56,14 +56,21 @@ final_exp: %stack (val) -> (val, 0, val) // stack: val, 0, val, retdest %move_fp254_12 - // stack: 0, val, retdest {0: sqr} - %stack () -> (1, 1, 1) - // stack: 1, 1, 1, 0, val, retdest - %mstore_bn254_pairing(12) - %mstore_bn254_pairing(24) - %mstore_bn254_pairing(36) - // stack: 0, val, retdest {0: sqr, 12: y0, 24: y2, 36: y4} - %stack () -> (64, 62, 65) + // dest addr returned by %move_fp254_12 is already scaled + // stack: addr, val, retdest {0: sqr} + + // Write 1s at offset 12, 24 and 36 + PUSH 12 + ADD + DUP1 %add_const(12) + DUP1 %add_const(12) + // stack: addr_1, addr_2, addr_3 + %rep 3 + PUSH 1 MSTORE_GENERAL + %endrep + + // stack: val, retdest {0: sqr, 12: y0, 24: y2, 36: y4} + %stack () -> (64, 62, 65, 0) // stack: 64, 62, 65, 0, val, retdest {0: sqr, 12: y0, 24: y2, 36: y4} %jump(power_loop_4) diff --git a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/miller_loop.asm b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/miller_loop.asm index 
3b4ded57e2..99cf24e71d 100644 --- a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/miller_loop.asm +++ b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/miller_loop.asm @@ -27,9 +27,9 @@ global bn254_miller: // stack: ptr, out, retdest - %stack (ptr, out) -> (out, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ptr, out) - // stack: out, unit, ptr, out, retdest - %store_fp254_12 + %stack (ptr, out) -> (out, ptr, out) + // stack: out, ptr, out, retdest + %write_fp254_12_unit // stack: ptr, out, retdest %load_fp254_6 // stack: P, Q, out, retdest @@ -39,7 +39,7 @@ global bn254_miller: miller_loop: POP // stack: times , O, P, Q, out, retdest - DUP1 + DUP1 ISZERO // stack: break?, times , O, P, Q, out, retdest %jumpi(miller_return) @@ -60,7 +60,7 @@ miller_return: miller_one: // stack: 0xnm, times, O, P, Q, out, retdest - DUP1 + DUP1 %lt_const(0x20) // stack: skip?, 0xnm, times, O, P, Q, out, retdest %jumpi(miller_zero) @@ -73,7 +73,7 @@ miller_one: miller_zero: // stack: m , times, O, P, Q, out, retdest - DUP1 + DUP1 ISZERO // stack: skip?, m , times, O, P, Q, out, retdest %jumpi(miller_loop) @@ -93,8 +93,8 @@ miller_zero: mul_tangent: // stack: retdest, 0xnm, times, O, P, Q, out - PUSH mul_tangent_2 - DUP13 + PUSH mul_tangent_2 + DUP13 PUSH mul_tangent_1 // stack: mul_tangent_1, out, mul_tangent_2, retdest, 0xnm, times, O, P, Q, out %stack (mul_tangent_1, out) -> (out, out, mul_tangent_1, out) @@ -107,7 +107,7 @@ mul_tangent_1: DUP13 DUP13 // stack: Q, out, mul_tangent_2, retdest, 0xnm, times, O, P, Q, out - DUP11 + DUP11 DUP11 // stack: O, Q, out, mul_tangent_2, retdest, 0xnm, times, O, P, Q, out %tangent @@ -141,15 +141,15 @@ mul_cord: // stack: 0xnm, times, O, P, Q, out PUSH mul_cord_1 // stack: mul_cord_1, 0xnm, times, O, P, Q, out - DUP11 - DUP11 - DUP11 + DUP11 + DUP11 + DUP11 DUP11 // stack: Q, mul_cord_1, 0xnm, times, O, P, Q, out - DUP9 + DUP9 DUP9 // stack: O, Q, mul_cord_1, 0xnm, times, O, P, Q, out - DUP13 + DUP13 DUP13 // stack: P, O, Q, mul_cord_1, 0xnm, 
times, O, P, Q, out %cord @@ -188,43 +188,51 @@ after_add: %macro tangent // stack: px, py, qx, qx_, qy, qy_ - %stack (px, py) -> (py, py , 9, px, py) - // stack: py, py , 9, px, py, qx, qx_, qy, qy_ + PUSH 12 + %create_bn254_pairing_address + %stack (addr12, px, py) -> (py, py, 9, addr12, addr12, px, py) + // stack: py, py, 9, addr12, addr12, px, py, qx, qx_, qy, qy_ MULFP254 - // stack: py^2 , 9, px, py, qx, qx_, qy, qy_ + // stack: py^2, 9, addr12, addr12, px, py, qx, qx_, qy, qy_ SUBFP254 - // stack: py^2 - 9, px, py, qx, qx_, qy, qy_ - %mstore_bn254_pairing(12) - // stack: px, py, qx, qx_, qy, qy_ - DUP1 + // stack: py^2 - 9, addr12, addr12, px, py, qx, qx_, qy, qy_ + MSTORE_GENERAL + // stack: addr12, px, py, qx, qx_, qy, qy_ + %add_const(2) DUP1 + SWAP2 + DUP1 MULFP254 - // stack: px^2, py, qx, qx_, qy, qy_ - PUSH 3 + // stack: px^2, addr14, addr14, py, qx, qx_, qy, qy_ + PUSH 3 MULFP254 - // stack: 3*px^2, py, qx, qx_, qy, qy_ - PUSH 0 + // stack: 3*px^2, addr14, addr14, py, qx, qx_, qy, qy_ + PUSH 0 SUBFP254 - // stack: -3*px^2, py, qx, qx_, qy, qy_ - SWAP2 - // stack: qx, py, -3px^2, qx_, qy, qy_ - DUP3 + // stack: -3*px^2, addr14, addr14, py, qx, qx_, qy, qy_ + SWAP4 + // stack: qx, addr14, addr14, py, -3px^2, qx_, qy, qy_ + DUP5 MULFP254 - // stack: (-3*px^2)qx, py, -3px^2, qx_, qy, qy_ - %mstore_bn254_pairing(14) - // stack: py, -3px^2, qx_, qy, qy_ - PUSH 2 + // stack: (-3*px^2)qx, addr14, addr14, py, -3px^2, qx_, qy, qy_ + MSTORE_GENERAL + // stack: addr14, py, -3px^2, qx_, qy, qy_ + DUP1 %add_const(6) + // stack: addr20, addr14, py, -3px^2, qx_, qy, qy_ + %stack (addr20, addr14, py) -> (2, py, addr20, addr14) MULFP254 - // stack: 2py, -3px^2, qx_, qy, qy_ - SWAP3 - // stack: qy, -3px^2, qx_, 2py, qy_ - DUP4 + // stack: 2py, addr20, addr14, -3px^2, qx_, qy, qy_ + SWAP5 + // stack: qy, addr20, addr14, -3px^2, qx_, 2py, qy_ + DUP6 MULFP254 - // stack: (2py)qy, -3px^2, qx_, 2py, qy_ - %mstore_bn254_pairing(20) - // stack: -3px^2, qx_, 2py, qy_ + // 
stack: (2py)qy, addr20, addr14, -3px^2, qx_, 2py, qy_ + MSTORE_GENERAL + // stack: addr14, -3px^2, qx_, 2py, qy_ + %add_const(1) SWAP2 + // stack: qx_, -3px^2, addr15, 2py, qy_ MULFP254 - // stack: (-3px^2)*qx_, 2py, qy_ - %mstore_bn254_pairing(15) + // stack: (-3px^2)*qx_, addr15, 2py, qy_ + MSTORE_GENERAL // stack: 2py, qy_ MULFP254 // stack: (2py)*qy_ @@ -240,11 +248,11 @@ after_add: %macro cord // stack: p1x , p1y, p2x , p2y, qx, qx_, qy, qy_ - DUP1 - DUP5 + DUP1 + DUP5 MULFP254 // stack: p2y*p1x, p1x , p1y, p2x , p2y, qx, qx_, qy, qy_ - DUP3 + DUP3 DUP5 MULFP254 // stack: p1y*p2x , p2y*p1x, p1x , p1y, p2x , p2y, qx, qx_, qy, qy_ @@ -284,10 +292,34 @@ after_add: %endmacro %macro clear_line - %stack () -> (0, 0, 0, 0, 0) - %mstore_bn254_pairing(12) - %mstore_bn254_pairing(14) - %mstore_bn254_pairing(15) - %mstore_bn254_pairing(20) - %mstore_bn254_pairing(21) + PUSH 12 + %create_bn254_pairing_address + // stack: addr12 + DUP1 %add_const(2) + // stack: addr14, addr12 + DUP1 %add_const(1) + // stack: addr15, addr14, addr12 + DUP1 %add_const(5) + // stack: addr20, addr15, addr14, addr12 + DUP1 %add_const(1) + // stack: addr21, addr20, addr15, addr14, addr12 + %rep 5 + PUSH 0 MSTORE_GENERAL + %endrep +%endmacro + + +%macro write_fp254_12_unit + // Write 0x10000000000000000000000 with MSTORE_32BYTES_12, + // effectively storing 1 at the initial offset, and 11 0s afterwards. 
+ + // stack: out + %create_bn254_pairing_address + // stack: addr + PUSH 0x10000000000000000000000 + SWAP1 + // stack: addr, 0x10000000000000000000000 + MSTORE_32BYTES_12 + POP + // stack: %endmacro diff --git a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/msm.asm b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/msm.asm index 1036228737..d5b97312ba 100644 --- a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/msm.asm +++ b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/msm.asm @@ -42,31 +42,31 @@ bn_msm_loop_add_b_nonzero: %macro bn_mload_wnaf_a // stack: i - %mload_kernel(@SEGMENT_KERNEL_BN_WNAF_A) + %mload_current(@SEGMENT_BN_WNAF_A) %endmacro %macro bn_mload_wnaf_b // stack: i - %mload_kernel(@SEGMENT_KERNEL_BN_WNAF_B) + %mload_current(@SEGMENT_BN_WNAF_B) %endmacro %macro bn_mload_point_a // stack: w DUP1 - %mload_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) + %mload_current(@SEGMENT_BN_TABLE_Q) //stack: Gy, w - SWAP1 %decrement %mload_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) + SWAP1 %decrement %mload_current(@SEGMENT_BN_TABLE_Q) //stack: Gx, Gy %endmacro %macro bn_mload_point_b // stack: w DUP1 - %mload_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) - PUSH @BN_BNEG_LOC %mload_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) + %mload_current(@SEGMENT_BN_TABLE_Q) + PUSH @BN_BNEG_LOC %mload_current(@SEGMENT_BN_TABLE_Q) %stack (bneg, Gy, w) -> (@BN_BASE, Gy, bneg, bneg, Gy, w) SUB SWAP1 ISZERO MUL SWAP2 MUL ADD - SWAP1 %decrement %mload_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) + SWAP1 %decrement %mload_current(@SEGMENT_BN_TABLE_Q) //stack: Gx, Gy PUSH @BN_GLV_BETA MULFP254 diff --git a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/pairing.asm b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/pairing.asm index c63c3b35e3..735d001aae 100644 --- a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/pairing.asm +++ b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/pairing.asm @@ -83,9 +83,9 @@ bn_pairing_invalid_input: bn254_pairing_start: // stack: 0, k, inp, out, retdest - %stack 
(j, k, inp, out) -> (out, 1, k, inp, out, bn254_pairing_output_validation, out) - // stack: out, 1, k, inp, out, bn254_pairing_output_validation, out, retdest - %mstore_bn254_pairing + %stack (j, k, inp, out) -> (out, k, inp, out, bn254_pairing_output_validation, out) + // stack: out, k, inp, out, bn254_pairing_output_validation, out, retdest + %mstore_bn254_pairing_value(1) // stack: k, inp, out, bn254_pairing_output_validation, out, retdest bn254_pairing_loop: @@ -125,8 +125,9 @@ bn_skip_input: bn254_pairing_output_validation: // stack: out, retdest + %create_bn254_pairing_address PUSH 1 - // stack: check, out, retdest + // stack: check, out_addr, retdest %check_output_term %check_output_term(1) %check_output_term(2) @@ -139,15 +140,15 @@ bn254_pairing_output_validation: %check_output_term(9) %check_output_term(10) %check_output_term(11) - // stack: check, out, retdest - %stack (check, out, retdest) -> (retdest, check) + // stack: check, out_addr, retdest + %stack (check, out_addr, retdest) -> (retdest, check) JUMP %macro check_output_term // stack: check, out DUP2 // stack: out0, check, out - %mload_bn254_pairing + MLOAD_GENERAL // stack: f0, check, out %eq_const(1) // stack: check0, check, out @@ -160,7 +161,7 @@ bn254_pairing_output_validation: DUP2 %add_const($j) // stack: outj, check, out - %mload_bn254_pairing + MLOAD_GENERAL // stack: fj, check, out ISZERO // stack: checkj, check, out diff --git a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/precomputation.asm b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/precomputation.asm index a8c6ada926..5ee6685fe6 100644 --- a/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/precomputation.asm +++ b/evm/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/precomputation.asm @@ -1,5 +1,5 @@ // Precompute a table of multiples of the BN254 point `Q = (Qx, Qy)`. 
-// Let `(Qxi, Qyi) = i * Q`, then store in the `SEGMENT_KERNEL_BN_TABLE_Q` segment of memory the values +// Let `(Qxi, Qyi) = i * Q`, then store in the `SEGMENT_BN_TABLE_Q` segment of memory the values // `i-1 => Qxi`, `i => Qyi if i < 16 else -Qy(32-i)` for `i in range(1, 32, 2)`. global bn_precompute_table: // stack: Qx, Qy, retdest @@ -12,14 +12,14 @@ bn_precompute_table_loop: // stack i, Qx2, Qy2, Qx, Qy, retdest PUSH 1 DUP2 SUB %stack (im, i, Qx2, Qy2, Qx, Qy, retdest) -> (i, Qy, im, Qx, i, Qx2, Qy2, Qx, Qy, retdest) - %mstore_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) %mstore_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) + %mstore_current(@SEGMENT_BN_TABLE_Q) %mstore_current(@SEGMENT_BN_TABLE_Q) // stack: i, Qx2, Qy2, Qx, Qy, retdest DUP1 PUSH 32 SUB PUSH 1 DUP2 SUB // stack: 31-i, 32-i, i, Qx2, Qy2, Qx, Qy, retdest DUP7 PUSH @BN_BASE SUB // TODO: Could maybe avoid storing Qx a second time here, not sure if it would be more efficient. %stack (Qyy, iii, ii, i, Qx2, Qy2, Qx, Qy, retdest) -> (iii, Qx, ii, Qyy, i, Qx2, Qy2, Qx, Qy, retdest) - %mstore_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) %mstore_kernel(@SEGMENT_KERNEL_BN_TABLE_Q) + %mstore_current(@SEGMENT_BN_TABLE_Q) %mstore_current(@SEGMENT_BN_TABLE_Q) // stack: i, Qx2, Qy2, Qx, Qy, retdest PUSH 2 ADD // stack: i+2, Qx2, Qy2, Qx, Qy, retdest diff --git a/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/inverse.asm b/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/inverse.asm index 947c972a32..7c7729057c 100644 --- a/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/inverse.asm +++ b/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/inverse.asm @@ -1,4 +1,4 @@ -// Returns reverse order divison y/x, modulo N +// Returns reverse order division y/x, modulo N %macro divr_fp254 // stack: x , y %inv_fp254 @@ -42,9 +42,12 @@ check_inv_fp254_12: // stack: unit?, retdest %assert_eq_unit_fp254_12 // stack: retdest + PUSH 60 + %create_bn254_pairing_address PUSH 0 - // stack: 0, retdest - %mstore_bn254_pairing(60) + // stack: 0, 
addr, retdest + MSTORE_GENERAL + // stack: retdest JUMP %macro prover_inv_fp254_12 diff --git a/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/util.asm b/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/util.asm index 6dbddddcea..897404dbf2 100644 --- a/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/util.asm +++ b/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/util.asm @@ -1,7 +1,7 @@ // Load a single value from bn254 pairings memory. %macro mload_bn254_pairing // stack: offset - %mload_current(@SEGMENT_KERNEL_BN_PAIRING) + %mload_current(@SEGMENT_BN_PAIRING) // stack: value %endmacro @@ -9,14 +9,32 @@ // stack: PUSH $offset // stack: offset - %mload_current(@SEGMENT_KERNEL_BN_PAIRING) + %mload_current(@SEGMENT_BN_PAIRING) // stack: value %endmacro // Store a single value to bn254 pairings memory. %macro mstore_bn254_pairing // stack: offset, value - %mstore_current(@SEGMENT_KERNEL_BN_PAIRING) + %mstore_current(@SEGMENT_BN_PAIRING) + // stack: +%endmacro + +// Build an address on the current context within SEGMENT_BN_PAIRING. +%macro create_bn254_pairing_address + // stack: offset + PUSH @SEGMENT_BN_PAIRING + GET_CONTEXT + %build_address + // stack: addr +%endmacro + +// Store a single value to bn254 pairings memory. 
+%macro mstore_bn254_pairing_value(value) + // stack: offset + %create_bn254_pairing_address + PUSH $value + MSTORE_GENERAL // stack: %endmacro @@ -24,7 +42,7 @@ // stack: value PUSH $offset // stack: offset, value - %mstore_current(@SEGMENT_KERNEL_BN_PAIRING) + %mstore_current(@SEGMENT_BN_PAIRING) // stack: %endmacro @@ -32,14 +50,15 @@ %macro load_fp254_2 // stack: ptr - DUP1 + %create_bn254_pairing_address + DUP1 %add_const(1) - // stack: ind1, ptr - %mload_bn254_pairing - // stack: x1, ptr + // stack: addr1, addr + MLOAD_GENERAL + // stack: x1, addr SWAP1 - // stack: ind0, x1 - %mload_bn254_pairing + // stack: addr0, x1 + MLOAD_GENERAL // stack: x0, x1 %endmacro @@ -101,14 +120,14 @@ // stack: b, a , b DUP2 // stack: a , b, a , b - PUSH 9 + PUSH 9 MULFP254 // stack: 9a , b, a , b SUBFP254 // stack: 9a - b, a , b SWAP2 // stack: b , a, 9a - b - PUSH 9 + PUSH 9 MULFP254 // stack 9b , a, 9a - b ADDFP254 @@ -145,24 +164,25 @@ %macro load_fp254_4 // stack: ptr - DUP1 + %create_bn254_pairing_address + DUP1 %add_const(2) - // stack: ind2, ptr - %mload_bn254_pairing - // stack: x2, ptr - DUP2 + // stack: addr2, addr + MLOAD_GENERAL + // stack: x2, addr + DUP2 %add_const(1) - // stack: ind1, x2, ptr - %mload_bn254_pairing - // stack: x1, x2, ptr - DUP3 + // stack: addr1, x2, addr + MLOAD_GENERAL + // stack: x1, x2, addr + DUP3 %add_const(3) - // stack: ind3, x1, x2, ptr - %mload_bn254_pairing - // stack: x3, x1, x2, ptr + // stack: addr3, x1, x2, addr + MLOAD_GENERAL + // stack: x3, x1, x2, addr SWAP3 - // stack: ind0, x1, x2, x3 - %mload_bn254_pairing + // stack: addr0, x1, x2, x3 + MLOAD_GENERAL // stack: x0, x1, x2, x3 %endmacro @@ -170,228 +190,177 @@ %macro load_fp254_6 // stack: ptr - DUP1 + %create_bn254_pairing_address + DUP1 %add_const(4) - // stack: ind4, ptr - %mload_bn254_pairing - // stack: x4, ptr - DUP2 + // stack: addr4, addr + MLOAD_GENERAL + // stack: x4, addr + DUP2 %add_const(3) - // stack: ind3, x4, ptr - %mload_bn254_pairing - // stack: x3, x4, ptr 
- DUP3 + // stack: addr3, x4, addr + MLOAD_GENERAL + // stack: x3, x4, addr + DUP3 %add_const(2) - // stack: ind2, x3, x4, ptr - %mload_bn254_pairing - // stack: x2, x3, x4, ptr - DUP4 + // stack: addr2, x3, x4, addr + MLOAD_GENERAL + // stack: x2, x3, x4, addr + DUP4 %add_const(1) - // stack: ind1, x2, x3, x4, ptr - %mload_bn254_pairing - // stack: x1, x2, x3, x4, ptr - DUP5 + // stack: addr1, x2, x3, x4, addr + MLOAD_GENERAL + // stack: x1, x2, x3, x4, addr + DUP5 %add_const(5) - // stack: ind5, x1, x2, x3, x4, ptr - %mload_bn254_pairing - // stack: x5, x1, x2, x3, x4, ptr + // stack: addr5, x1, x2, x3, x4, addr + MLOAD_GENERAL + // stack: x5, x1, x2, x3, x4, addr SWAP5 - // stack: ind0, x1, x2, x3, x4, x5 - %mload_bn254_pairing + // stack: addr0, x1, x2, x3, x4, x5 + MLOAD_GENERAL // stack: x0, x1, x2, x3, x4, x5 %endmacro -// cost: 6 loads + 6 pushes + 5 adds = 6*4 + 6*1 + 5*2 = 40 %macro load_fp254_6(ptr) // stack: - PUSH $ptr - %add_const(5) - // stack: ind5 - %mload_bn254_pairing - // stack: x5 - PUSH $ptr - %add_const(4) - // stack: ind4, x5 - %mload_bn254_pairing - // stack: x4, x5 - PUSH $ptr - %add_const(3) - // stack: ind3, x4, x5 - %mload_bn254_pairing - // stack: x3, x4, x5 - PUSH $ptr - %add_const(2) - // stack: ind2, x3, x4, x5 - %mload_bn254_pairing - // stack: x2, x3, x4, x5 - PUSH $ptr - %add_const(1) - // stack: ind1, x2, x3, x4, x5 - %mload_bn254_pairing - // stack: x1, x2, x3, x4, x5 PUSH $ptr - // stack: ind0, x1, x2, x3, x4, x5 - %mload_bn254_pairing - // stack: x0, x1, x2, x3, x4, x5 + %load_fp254_6 + // stack: x0, x1, x2, x3, x4, x5 %endmacro -// cost: 6 stores + 6 swaps/dups + 5 adds = 6*4 + 6*1 + 5*2 = 40 %macro store_fp254_6 // stack: ptr, x0, x1, x2, x3, x4 , x5 + %create_bn254_pairing_address SWAP5 - // stack: x4, x0, x1, x2, x3, ptr, x5 - DUP6 + // stack: x4, x0, x1, x2, x3, addr, x5 + DUP6 %add_const(4) - // stack: ind4, x4, x0, x1, x2, x3, ptr, x5 - %mstore_bn254_pairing - // stack: x0, x1, x2, x3, ptr, x5 + // stack: addr4, x4, 
x0, x1, x2, x3, addr, x5 + %swap_mstore + // stack: x0, x1, x2, x3, addr, x5 DUP5 - // stack: ind0, x0, x1, x2, x3, ptr, x5 - %mstore_bn254_pairing - // stack: x1, x2, x3, ptr, x5 - DUP4 + // stack: addr0, x0, x1, x2, x3, addr, x5 + %swap_mstore + // stack: x1, x2, x3, addr, x5 + DUP4 %add_const(1) - // stack: ind1, x1, x2, x3, ptr, x5 - %mstore_bn254_pairing - // stack: x2, x3, ptr, x5 - DUP3 + // stack: addr1, x1, x2, x3, addr, x5 + %swap_mstore + // stack: x2, x3, addr, x5 + DUP3 %add_const(2) - // stack: ind2, x2, x3, ptr, x5 - %mstore_bn254_pairing - // stack: x3, ptr, x5 - DUP2 + // stack: addr2, x2, x3, addr, x5 + %swap_mstore + // stack: x3, addr, x5 + DUP2 %add_const(3) - // stack: ind3, x3, ptr, x5 - %mstore_bn254_pairing - // stack: ptr, x5 + // stack: addr3, x3, addr, x5 + %swap_mstore + // stack: addr, x5 %add_const(5) - // stack: ind5, x5 - %mstore_bn254_pairing + // stack: addr5, x5 + %swap_mstore // stack: %endmacro -// cost: 6 stores + 7 swaps/dups + 5 adds + 6 doubles = 6*4 + 7*1 + 5*2 + 6*2 = 53 %macro store_fp254_6_double // stack: ptr, x0, x1, x2, x3, x4, x5 + %create_bn254_pairing_address SWAP6 - // stack: x5, x0, x1, x2, x3, x4, ptr - PUSH 2 + // stack: x5, x0, x1, x2, x3, x4, addr + PUSH 2 MULFP254 - // stack: 2*x5, x0, x1, x2, x3, x4, ptr - DUP7 + // stack: 2*x5, x0, x1, x2, x3, x4, addr + DUP7 %add_const(5) - // stack: ind5, 2*x5, x0, x1, x2, x3, x4, ptr - %mstore_bn254_pairing - // stack: x0, x1, x2, x3, x4, ptr - PUSH 2 + // stack: addr5, 2*x5, x0, x1, x2, x3, x4, addr + %swap_mstore + // stack: x0, x1, x2, x3, x4, addr + PUSH 2 MULFP254 - // stack: 2*x0, x1, x2, x3, x4, ptr + // stack: 2*x0, x1, x2, x3, x4, addr DUP6 - // stack: ind0, 2*x0, x1, x2, x3, x4, ptr - %mstore_bn254_pairing - // stack: x1, x2, x3, x4, ptr - PUSH 2 + // stack: addr0, 2*x0, x1, x2, x3, x4, addr + %swap_mstore + // stack: x1, x2, x3, x4, addr + PUSH 2 MULFP254 - // stack: 2*x1, x2, x3, x4, ptr - DUP5 + // stack: 2*x1, x2, x3, x4, addr + DUP5 %add_const(1) - // 
stack: ind1, 2*x1, x2, x3, x4, ptr - %mstore_bn254_pairing - // stack: x2, x3, x4, ptr - PUSH 2 + // stack: addr1, 2*x1, x2, x3, x4, addr + %swap_mstore + // stack: x2, x3, x4, addr + PUSH 2 MULFP254 - // stack: 2*x2, x3, x4, ptr - DUP4 + // stack: 2*x2, x3, x4, addr + DUP4 %add_const(2) - // stack: ind2, 2*x2, x3, x4, ptr - %mstore_bn254_pairing - // stack: x3, x4, ptr + // stack: addr2, 2*x2, x3, x4, addr + %swap_mstore + // stack: x3, x4, addr PUSH 2 MULFP254 - // stack: 2*x3, x4, ptr - DUP3 + // stack: 2*x3, x4, addr + DUP3 %add_const(3) - // stack: ind3, 2*x3, x4, ptr - %mstore_bn254_pairing - // stack: x4, ptr - PUSH 2 + // stack: addr3, 2*x3, x4, addr + %swap_mstore + // stack: x4, addr + PUSH 2 MULFP254 - // stack: 2*x4, ptr + // stack: 2*x4, addr SWAP1 - // stack: ptr, 2*x4 + // stack: addr, 2*x4 %add_const(4) - // stack: ind4, 2*x4 - %mstore_bn254_pairing + // stack: addr4, 2*x4 + %swap_mstore // stack: %endmacro -// cost: 6 stores + 6 pushes + 5 adds = 6*4 + 6*1 + 5*2 = 40 %macro store_fp254_6(ptr) - // stack: x0, x1, x2, x3, x4, x5 + // stack: x0, x1, x2, x3, x4, x5 PUSH $ptr - // stack: ind0, x0, x1, x2, x3, x4, x5 - %mstore_bn254_pairing - // stack: x1, x2, x3, x4, x5 - PUSH $ptr - %add_const(1) - // stack: ind1, x1, x2, x3, x4, x5 - %mstore_bn254_pairing - // stack: x2, x3, x4, x5 - PUSH $ptr - %add_const(2) - // stack: ind2, x2, x3, x4, x5 - %mstore_bn254_pairing - // stack: x3, x4, x5 - PUSH $ptr - %add_const(3) - // stack: ind3, x3, x4, x5 - %mstore_bn254_pairing - // stack: x4, x5 - PUSH $ptr - %add_const(4) - // stack: ind4, x4, x5 - %mstore_bn254_pairing - // stack: x5 - PUSH $ptr - %add_const(5) - // stack: ind5, x5 - %mstore_bn254_pairing + %store_fp254_6 // stack: %endmacro -// cost: store (40) + i9 (9) = 49 %macro store_fp254_6_sh(ptr) // stack: x0, x1, x2, x3, x4, x5 - PUSH $ptr + PUSH $ptr + %create_bn254_pairing_address + // stack: addr, x0, x1, x2, x3, x4, x5 %add_const(2) - // stack: ind2, x0, x1, x2, x3, x4, x5 - %mstore_bn254_pairing 
- // stack: x1, x2, x3, x4, x5 - PUSH $ptr - %add_const(3) - // stack: ind3, x1, x2, x3, x4, x5 - %mstore_bn254_pairing - // stack: x2, x3, x4, x5 - PUSH $ptr - %add_const(4) - // stack: ind4, x2, x3, x4, x5 - %mstore_bn254_pairing - // stack: x3, x4, x5 - PUSH $ptr - %add_const(5) - // stack: ind5, x3, x4, x5 - %mstore_bn254_pairing + DUP1 + // stack: addr2, addr2, x0, x1, x2, x3, x4, x5 + SWAP2 MSTORE_GENERAL + // stack: addr2, x1, x2, x3, x4, x5 + %add_const(1) + DUP1 + // stack: addr3, addr3, x1, x2, x3, x4, x5 + SWAP2 MSTORE_GENERAL + // stack: addr3, x2, x3, x4, x5 + %add_const(1) + DUP1 + // stack: addr4, addr4, x2, x3, x4, x5 + SWAP2 MSTORE_GENERAL + // stack: addr4, x3, x4, x5 + %add_const(1) + // stack: addr5, x3, x4, x5 + %swap_mstore // stack: x4, x5 %i9 // stack: y5, y4 PUSH $ptr + %create_bn254_pairing_address + DUP1 %add_const(1) - // stack: ind1, y5, y4 - %mstore_bn254_pairing - // stack: y4 - PUSH $ptr - // stack: ind0, y4 - %mstore_bn254_pairing + // stack: addr1, addr, y5, y4 + SWAP3 + MSTORE_GENERAL + // stack: y5, addr1 + MSTORE_GENERAL // stack: %endmacro @@ -575,10 +544,10 @@ MULFP254 SWAP3 // stack: c , f0, f1, c * f2, c * f3, c *f 4, c * f5 - SWAP2 - DUP3 + SWAP2 + DUP3 MULFP254 - SWAP2 + SWAP2 // stack: c , f0, c * f1, c * f2, c * f3, c * f4, c * f5 MULFP254 // stack: c * f0, c * f1, c * f2, c * f3, c * f4, c * f5 @@ -864,264 +833,268 @@ %macro load_fp254_12 // stack: ptr - DUP1 + %create_bn254_pairing_address + DUP1 %add_const(10) - // stack: ind10, ptr - %mload_bn254_pairing - // stack: x10, ptr - DUP2 + // stack: addr10, addr + MLOAD_GENERAL + // stack: x10, addr + DUP2 %add_const(9) - // stack: ind09, x10, ptr - %mload_bn254_pairing - // stack: x09, x10, ptr - DUP3 + // stack: addr09, x10, addr + MLOAD_GENERAL + // stack: x09, x10, addr + DUP3 %add_const(8) - // stack: ind08, x09, x10, ptr - %mload_bn254_pairing - // stack: x08, x09, x10, ptr - DUP4 + // stack: addr08, x09, x10, addr + MLOAD_GENERAL + // stack: x08, x09, x10, addr + 
DUP4 %add_const(7) - // stack: ind07, x08, x09, x10, ptr - %mload_bn254_pairing - // stack: x07, x08, x09, x10, ptr - DUP5 + // stack: addr07, x08, x09, x10, addr + MLOAD_GENERAL + // stack: x07, x08, x09, x10, addr + DUP5 %add_const(6) - // stack: ind06, x07, x08, x09, x10, ptr - %mload_bn254_pairing - // stack: x06, x07, x08, x09, x10, ptr - DUP6 + // stack: addr06, x07, x08, x09, x10, addr + MLOAD_GENERAL + // stack: x06, x07, x08, x09, x10, addr + DUP6 %add_const(5) - // stack: ind05, x06, x07, x08, x09, x10, ptr - %mload_bn254_pairing - // stack: x05, x06, x07, x08, x09, x10, ptr - DUP7 + // stack: addr05, x06, x07, x08, x09, x10, addr + MLOAD_GENERAL + // stack: x05, x06, x07, x08, x09, x10, addr + DUP7 %add_const(4) - // stack: ind04, x05, x06, x07, x08, x09, x10, ptr - %mload_bn254_pairing - // stack: x04, x05, x06, x07, x08, x09, x10, ptr - DUP8 + // stack: addr04, x05, x06, x07, x08, x09, x10, addr + MLOAD_GENERAL + // stack: x04, x05, x06, x07, x08, x09, x10, addr + DUP8 %add_const(3) - // stack: ind03, x04, x05, x06, x07, x08, x09, x10, ptr - %mload_bn254_pairing - // stack: x03, x04, x05, x06, x07, x08, x09, x10, ptr - DUP9 + // stack: addr03, x04, x05, x06, x07, x08, x09, x10, addr + MLOAD_GENERAL + // stack: x03, x04, x05, x06, x07, x08, x09, x10, addr + DUP9 %add_const(2) - // stack: ind02, x03, x04, x05, x06, x07, x08, x09, x10, ptr - %mload_bn254_pairing - // stack: x02, x03, x04, x05, x06, x07, x08, x09, x10, ptr - DUP10 + // stack: addr02, x03, x04, x05, x06, x07, x08, x09, x10, addr + MLOAD_GENERAL + // stack: x02, x03, x04, x05, x06, x07, x08, x09, x10, addr + DUP10 %add_const(1) - // stack: ind01, x02, x03, x04, x05, x06, x07, x08, x09, x10, ptr - %mload_bn254_pairing - // stack: x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, ptr - DUP11 + // stack: addr01, x02, x03, x04, x05, x06, x07, x08, x09, x10, addr + MLOAD_GENERAL + // stack: x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, addr + DUP11 %add_const(11) - // stack: ind11, x01, x02, 
x03, x04, x05, x06, x07, x08, x09, x10, ptr - %mload_bn254_pairing - // stack: x11, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, ptr + // stack: addr11, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, addr + MLOAD_GENERAL + // stack: x11, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, addr SWAP11 - // stack: ind00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11 - %mload_bn254_pairing + // stack: addr00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11 + MLOAD_GENERAL // stack: x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11 %endmacro %macro store_fp254_12 // stack: ptr, x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11 + %create_bn254_pairing_address SWAP11 - // stack: x10, x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, ptr, x11 - DUP12 + // stack: x10, x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, addr, x11 + DUP12 %add_const(10) - // stack: ind10, x10, x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, ptr, x11 + // stack: addr10, x10, x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, addr, x11 DUP11 - // stack: ind00, x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x01, x02, x03, x04, x05, x06, x07, x08, x09, ptr, x11 - DUP10 + // stack: addr00, x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x01, x02, x03, x04, x05, x06, x07, x08, x09, addr, x11 + DUP10 %add_const(01) - // stack: ind01, x01, x02, x03, x04, x05, x06, x07, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x02, x03, x04, x05, x06, x07, x08, x09, ptr, x11 + // stack: addr01, x01, x02, x03, x04, x05, x06, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x02, x03, x04, x05, x06, x07, x08, x09, addr, x11 DUP9 %add_const(02) - // stack: ind02, x02, x03, x04, x05, x06, x07, x08, x09, 
ptr, x11 - %mstore_bn254_pairing - // stack: x03, x04, x05, x06, x07, x08, x09, ptr, x11 + // stack: addr02, x02, x03, x04, x05, x06, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x03, x04, x05, x06, x07, x08, x09, addr, x11 DUP8 %add_const(03) - // stack: ind03, x03, x04, x05, x06, x07, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x04, x05, x06, x07, x08, x09, ptr, x11 + // stack: addr03, x03, x04, x05, x06, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x04, x05, x06, x07, x08, x09, addr, x11 DUP7 %add_const(04) - // stack: ind04, x04, x05, x06, x07, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x05, x06, x07, x08, x09, ptr, x11 + // stack: addr04, x04, x05, x06, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x05, x06, x07, x08, x09, addr, x11 DUP6 %add_const(05) - // stack: ind05, x05, x06, x07, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x06, x07, x08, x09, ptr, x11 + // stack: addr05, x05, x06, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x06, x07, x08, x09, addr, x11 DUP5 %add_const(06) - // stack: ind06, x06, x07, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x07, x08, x09, ptr, x11 + // stack: addr06, x06, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x07, x08, x09, addr, x11 DUP4 %add_const(07) - // stack: ind07, x07, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x08, x09, ptr, x11 + // stack: addr07, x07, x08, x09, addr, x11 + %swap_mstore + // stack: x08, x09, addr, x11 DUP3 %add_const(08) - // stack: ind08, x08, x09, ptr, x11 - %mstore_bn254_pairing - // stack: x09, ptr, x11 + // stack: addr08, x08, x09, addr, x11 + %swap_mstore + // stack: x09, addr, x11 DUP2 %add_const(09) - // stack: ind09, x09, ptr, x11 - %mstore_bn254_pairing - // stack: ptr, x11 + // stack: addr09, x09, addr, x11 + %swap_mstore + // stack: addr, x11 %add_const(11) - // stack: ind11, x11 - %mstore_bn254_pairing + // stack: addr11, x11 + %swap_mstore // stack: %endmacro /// moves fp254_12 from src..src+12 
to dest..dest+12 -/// these should not overlap. leaves dest on stack +/// these should not overlap. leaves scaled DEST on stack %macro move_fp254_12 // stack: src, dest - DUP1 - // stack: ind00, src, dest - %mload_bn254_pairing - // stack: x00, src, dest + PUSH @SEGMENT_BN_PAIRING + GET_CONTEXT + %build_address_no_offset + DUP1 + // stack: base_addr, base_addr, src, dest + SWAP3 ADD + // stack: DEST, src, base_addr + SWAP2 ADD + // stack: SRC, DEST + DUP1 + // stack: addr00, SRC, DEST + MLOAD_GENERAL + // stack: x00, SRC, DEST DUP3 - // stack: ind00', x00, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr00', x00, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(1) - // stack: ind01, src, dest - %mload_bn254_pairing - // stack: x01, src, dest - DUP3 + // stack: addr01, SRC, DEST + MLOAD_GENERAL + // stack: x01, SRC, DEST + DUP3 %add_const(1) - // stack: ind01', x01, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr01', x01, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(2) - // stack: ind02, src, dest - %mload_bn254_pairing - // stack: x02, src, dest - DUP3 + // stack: addr02, SRC, DEST + MLOAD_GENERAL + // stack: x02, SRC, DEST + DUP3 %add_const(2) - // stack: ind02', x02, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr02', x02, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(3) - // stack: ind03, src, dest - %mload_bn254_pairing - // stack: x03, src, dest - DUP3 + // stack: addr03, SRC, DEST + MLOAD_GENERAL + // stack: x03, SRC, DEST + DUP3 %add_const(3) - // stack: ind03', x03, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr03', x03, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(4) - // stack: ind04, src, dest - %mload_bn254_pairing - // stack: x04, src, dest + // stack: addr04, SRC, DEST + MLOAD_GENERAL + // stack: x04, SRC, DEST DUP3 %add_const(4) - // stack: ind04', 
x04, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr04', x04, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(5) - // stack: ind05, src, dest - %mload_bn254_pairing - // stack: x05, src, dest - DUP3 + // stack: addr05, SRC, DEST + MLOAD_GENERAL + // stack: x05, SRC, DEST + DUP3 %add_const(5) - // stack: ind05', x05, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr05', x05, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(6) - // stack: ind06, src, dest - %mload_bn254_pairing - // stack: x06, src, dest - DUP3 + // stack: addr06, SRC, DEST + MLOAD_GENERAL + // stack: x06, SRC, DEST + DUP3 %add_const(6) - // stack: ind06', x06, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr06', x06, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(7) - // stack: ind07, src, dest - %mload_bn254_pairing - // stack: x07, src, dest - DUP3 + // stack: addr07, SRC, DEST + MLOAD_GENERAL + // stack: x07, SRC, DEST + DUP3 %add_const(7) - // stack: ind07', x07, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr07', x07, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(8) - // stack: ind08, src, dest - %mload_bn254_pairing - // stack: x08, src, dest - DUP3 + // stack: addr08, SRC, DEST + MLOAD_GENERAL + // stack: x08, SRC, DEST + DUP3 %add_const(8) - // stack: ind08', x08, src, dest - %mstore_bn254_pairing - // stack: src, dest + // stack: addr08', x08, SRC, DEST + %swap_mstore + // stack: SRC, DEST DUP1 %add_const(9) - // stack: ind09, src, dest - %mload_bn254_pairing - // stack: x09, src, dest - DUP3 + // stack: addr09, SRC, DEST + MLOAD_GENERAL + // stack: x09, SRC, DEST + DUP3 %add_const(9) - // stack: ind09', x09, src, dest - %mstore_bn254_pairing - // stack: src, dest - DUP1 + // stack: addr09', x09, SRC, DEST + %swap_mstore + // stack: SRC, DEST + DUP1 %add_const(10) - // stack: ind10, src, dest - 
%mload_bn254_pairing - // stack: x10, src, dest - DUP3 + // stack: addr10, SRC, DEST + MLOAD_GENERAL + // stack: x10, SRC, DEST + DUP3 %add_const(10) - // stack: ind10', x10, src, dest - %mstore_bn254_pairing - // stack: src, dest + // stack: addr10', x10, SRC, DEST + %swap_mstore + // stack: SRC, DEST %add_const(11) - // stack: ind11, dest - %mload_bn254_pairing - // stack: x11, dest - DUP2 + // stack: addr11, DEST + MLOAD_GENERAL + // stack: x11, DEST + DUP2 %add_const(11) - // stack: ind11', x11, dest - %mstore_bn254_pairing + // stack: addr11', x11, DEST + %swap_mstore %endmacro %macro assert_eq_unit_fp254_12 %assert_eq_const(1) - %assert_zero - %assert_zero - %assert_zero - %assert_zero - %assert_zero - %assert_zero - %assert_zero - %assert_zero - %assert_zero - %assert_zero + %rep 10 + OR + %endrep %assert_zero %endmacro diff --git a/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm index c84536d807..c11031004f 100644 --- a/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm +++ b/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm @@ -87,9 +87,9 @@ ecdsa_after_precompute_loop: %mul_const(2) ADD %mul_const(2) ADD %mul_const(2) ADD %stack (index, i, accx, accy, a0, a1, b0, b1, retdest) -> (index, index, i, accx, accy, a0, a1, b0, b1, retdest) %mul_const(2) %add_const(1) - %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mload_current(@SEGMENT_ECDSA_TABLE) SWAP1 %mul_const(2) - %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mload_current(@SEGMENT_ECDSA_TABLE) %stack (Px, Py, i, accx, accy, a0, a1, b0, b1, retdest) -> (Px, Py, accx, accy, ecdsa_after_precompute_loop_contd, i, a0, a1, b0, b1, retdest) %jump(secp_add_valid_points) ecdsa_after_precompute_loop_contd: @@ -97,8 +97,9 @@ ecdsa_after_precompute_loop_contd: ISZERO %jumpi(ecdsa_after_precompute_loop_end) %jump(secp_double) ecdsa_after_precompute_loop_contd2: - %stack (accx, accy, i, a0, a1, b0, b1, retdest) -> (i, accx, accy, a0, a1, b0, b1, retdest) 
- %decrement %jump(ecdsa_after_precompute_loop) + %stack (accx, accy, i, a0, a1, b0, b1, retdest) -> (i, 1, accx, accy, a0, a1, b0, b1, retdest) + SUB // i - 1 + %jump(ecdsa_after_precompute_loop) ecdsa_after_precompute_loop_end: // Check that the public key is not the point at infinity. See https://github.com/ethereum/eth-keys/pull/76 for discussion. DUP2 DUP2 ISZERO SWAP1 ISZERO MUL %jumpi(pk_is_infinity) diff --git a/evm/src/cpu/kernel/asm/curve/secp256k1/precomputation.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/precomputation.asm index 3cea031556..b6bed1b0a9 100644 --- a/evm/src/cpu/kernel/asm/curve/secp256k1/precomputation.asm +++ b/evm/src/cpu/kernel/asm/curve/secp256k1/precomputation.asm @@ -1,27 +1,27 @@ // Initial stack: Gneg, Qneg, Qx, Qy, retdest -// Compute a*G ± b*phi(G) + c*Q ± d*phi(Q) for a,b,c,d in {0,1}^4 and store its x-coordinate at location `2*(8a+4b+2c+d)` and its y-coordinate at location `2*(8a+4b+2c+d)+1` in the SEGMENT_KERNEL_ECDSA_TABLE segment. +// Compute a*G ± b*phi(G) + c*Q ± d*phi(Q) for a,b,c,d in {0,1}^4 and store its x-coordinate at location `2*(8a+4b+2c+d)` and its y-coordinate at location `2*(8a+4b+2c+d)+1` in the SEGMENT_ECDSA_TABLE segment. global secp_precompute_table: // First store G, ± phi(G), G ± phi(G) // Use Gneg for the ±, e.g., ±phi(G) is computed as `Gneg * (-phi(G)) + (1-Gneg)*phi(G)` (note only the y-coordinate needs to be filtered). 
// stack: Gneg, Qneg, Qx, Qy, retdest PUSH 32670510020758816978083085130507043184471273380659243275938904335757337482424 PUSH 17 PUSH 55066263022277343669578718895168534326250603453777594175500187360389116729240 PUSH 16 - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) DUP1 DUP1 %mul_const(32670510020758816978083085130507043184471273380659243275938904335757337482424) SWAP1 PUSH 1 SUB %mul_const(83121579216557378445487899878180864668798711284981320763518679672151497189239) ADD PUSH 9 PUSH 85340279321737800624759429340272274763154997815782306132637707972559913914315 PUSH 8 - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) DUP1 DUP1 %mul_const(83121579216557378445487899878180864668798711284981320763518679672151497189239) SWAP1 PUSH 1 SUB %mul_const(100652675408719987021357910538015346127426077519185866739835120963490438734674) ADD PUSH 25 - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) DUP1 %mul_const(91177636130617246552803821781935006617134368061721227770777272682868638699771) SWAP1 PUSH 1 SUB %mul_const(66837770201594535779099350687042404727408598709762866365333192677982385899440) ADD PUSH 24 - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) // Then store Q, ±phi(Q), Q ± phi(Q) %stack (Qneg, Qx, Qy, retdest) -> (4, Qx, 5, Qy, Qx, @SECP_BASE, Qneg, Qx, Qy, retdest) - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) // stack: Qx, @SECP_BASE, Qx, Qy, retdest PUSH @SECP_GLV_BETA MULMOD %stack (betaQx, Qneg, Qx, Qy, retdest) -> (Qneg, Qy, Qneg, betaQx, Qx, Qy, retdest) @@ -29,42 +29,42 @@ global secp_precompute_table: // stack: 1-Qneg, Qneg*Qy, 
betaQx, Qx, Qy, retdest DUP5 PUSH @SECP_BASE SUB MUL ADD %stack (selectQy, betaQx, Qx, Qy, retdest) -> (2, betaQx, 3, selectQy, betaQx, selectQy, Qx, Qy, precompute_table_contd, retdest) - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) %jump(secp_add_valid_points_no_edge_case) precompute_table_contd: %stack (x, y, retdest) -> (6, x, 7, y, retdest) - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) PUSH 2 // Use a loop to store a*G ± b*phi(G) + c*Q ± d*phi(Q) for a,b,c,d in {0,1}^4. precompute_table_loop: // stack: i, retdest - DUP1 %increment %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + DUP1 %increment %mload_current(@SEGMENT_ECDSA_TABLE) %stack (y, i, retdest) -> (i, y, i, retdest) - %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mload_current(@SEGMENT_ECDSA_TABLE) PUSH precompute_table_loop_contd DUP3 DUP3 - PUSH 9 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) - PUSH 8 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + PUSH 9 %mload_current(@SEGMENT_ECDSA_TABLE) + PUSH 8 %mload_current(@SEGMENT_ECDSA_TABLE) // stack: Gx, Gy, x, y, precompute_table_loop_contd, x, y, i, retdest %jump(secp_add_valid_points) precompute_table_loop_contd: %stack (Rx, Ry, x, y, i, retdest) -> (i, 8, Rx, i, 9, Ry, x, y, i, retdest) - ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + ADD %mstore_current(@SEGMENT_ECDSA_TABLE) ADD %mstore_current(@SEGMENT_ECDSA_TABLE) DUP2 DUP2 - PUSH 17 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) - PUSH 16 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + PUSH 17 %mload_current(@SEGMENT_ECDSA_TABLE) + PUSH 16 %mload_current(@SEGMENT_ECDSA_TABLE) %stack (Gx, Gy, x, y, x, y, i, retdest) -> (Gx, Gy, x, y, precompute_table_loop_contd2, x, y, i, retdest) %jump(secp_add_valid_points) 
precompute_table_loop_contd2: %stack (Rx, Ry, x, y, i, retdest) -> (i, 16, Rx, i, 17, Ry, x, y, i, retdest) - ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) - PUSH 25 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) - PUSH 24 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + ADD %mstore_current(@SEGMENT_ECDSA_TABLE) ADD %mstore_current(@SEGMENT_ECDSA_TABLE) + PUSH 25 %mload_current(@SEGMENT_ECDSA_TABLE) + PUSH 24 %mload_current(@SEGMENT_ECDSA_TABLE) %stack (Gx, Gy, x, y, i, retdest) -> (Gx, Gy, x, y, precompute_table_loop_contd3, i, retdest) %jump(secp_add_valid_points) precompute_table_loop_contd3: %stack (Rx, Ry, i, retdest) -> (i, 24, Rx, i, 25, Ry, i, retdest) - ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + ADD %mstore_current(@SEGMENT_ECDSA_TABLE) ADD %mstore_current(@SEGMENT_ECDSA_TABLE) %add_const(2) DUP1 %eq_const(8) %jumpi(precompute_table_end) %jump(precompute_table_loop) diff --git a/evm/src/cpu/kernel/asm/curve/wnaf.asm b/evm/src/cpu/kernel/asm/curve/wnaf.asm index 555c9c8465..f554bc649d 100644 --- a/evm/src/cpu/kernel/asm/curve/wnaf.asm +++ b/evm/src/cpu/kernel/asm/curve/wnaf.asm @@ -34,7 +34,12 @@ wnaf_loop_contd: DUP2 SWAP1 SUB %stack (n, m, segment, o, retdest) -> (129, o, m, o, segment, n, retdest) SUB - %stack (i, m, o, segment, n, retdest) -> (0, segment, i, m, o, segment, n, retdest) + // stack: i, m, o, segment, n, retdest + DUP4 + GET_CONTEXT + %build_address + // stack: addr, m, o, segment, n, retdest + SWAP1 MSTORE_GENERAL // stack: o, segment, n, retdest DUP3 ISZERO %jumpi(wnaf_end) diff --git a/evm/src/cpu/kernel/asm/exp.asm b/evm/src/cpu/kernel/asm/exp.asm index 5dd6736655..4b798e841c 100644 --- a/evm/src/cpu/kernel/asm/exp.asm +++ b/evm/src/cpu/kernel/asm/exp.asm @@ -86,7 +86,7 @@ sys_exp_gas_loop_enter: // stack: e >> shift, shift, x, e, return_info %jumpi(sys_exp_gas_loop) // stack: shift_bits, x, e, return_info - %div_const(8) + %shr_const(3) // 
stack: byte_size_of_e := shift_bits / 8, x, e, return_info %mul_const(@GAS_EXPBYTE) %add_const(@GAS_EXP) diff --git a/evm/src/cpu/kernel/asm/hash/blake2/addresses.asm b/evm/src/cpu/kernel/asm/hash/blake2/addresses.asm index 06b93f9ea9..3244cfa1f2 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2/addresses.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2/addresses.asm @@ -1,12 +1,16 @@ // Address where the working version of the hash value is stored. +// It is ready to be used, i.e. already containing the current context +// and SEGMENT_KERNEL_GENERAL. %macro blake2_hash_value_addr - PUSH 0 - // stack: 0 - %mload_current_general - // stack: num_blocks + %build_current_general_address_no_offset + DUP1 + MLOAD_GENERAL + // stack: num_blocks, addr %block_size %add_const(2) - // stack: num_bytes+2 + // stack: num_bytes+2, addr + ADD + // stack: addr %endmacro // Address where the working version of the compression internal state is stored. diff --git a/evm/src/cpu/kernel/asm/hash/blake2/blake2_f.asm b/evm/src/cpu/kernel/asm/hash/blake2/blake2_f.asm index 95a4749e0f..d1a4a2ab64 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2/blake2_f.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2/blake2_f.asm @@ -6,9 +6,9 @@ global blake2_f: // stack: addr, rounds, h0...h7, m0...m15, t0, t1, flag, retdest %rep 8 // stack: addr, rounds, h_i, ... - %stack (addr, rounds, h_i) -> (addr, h_i, addr, rounds) - // stack: addr, h_i, addr, rounds, ... - %mstore_current_general + %stack (addr, rounds, h_i) -> (h_i, addr, addr, rounds) + // stack: h_i, addr, addr, rounds, ... + MSTORE_GENERAL %increment %endrep @@ -21,9 +21,9 @@ global blake2_f: // stack: message_addr, rounds, m0...m15, t0, t1, flag, retdest %rep 16 // stack: message_addr, rounds, m_i, ... - %stack (message_addr, rounds, m_i) -> (message_addr, m_i, message_addr, rounds) - // stack: message_addr, m_i, message_addr, rounds, ... 
- %mstore_current_general + %stack (message_addr, rounds, m_i) -> (m_i, message_addr, message_addr, rounds) + // stack: m_i, message_addr, message_addr, rounds, ... + MSTORE_GENERAL %increment %endrep @@ -37,7 +37,7 @@ global blake2_f: // stack: addr, ... DUP1 // stack: addr, addr, ... - %mload_current_general + MLOAD_GENERAL // stack: val, addr, ... SWAP1 // stack: addr, val, ... @@ -53,31 +53,30 @@ global blake2_f: // First eight words of the internal state: current hash value h_0, ..., h_7. %rep 8 - SWAP1 - DUP2 - %mstore_current_general + DUP1 + SWAP2 + MSTORE_GENERAL %increment %endrep // stack: start + 8, rounds, t0, t1, flag, retdest // Next four values of the internal state: first four IV values. PUSH 0 - // stack: 0, start + 8, rounds, t0, t1, flag, retdest + // stack: 0, addr, rounds, t0, t1, flag, retdest %rep 4 - // stack: i, loc, ... - DUP1 - // stack: i, i, loc, ... + // stack: i, addr, ... + DUP2 + DUP2 + // stack: i, addr, i, addr, ... %blake2_iv - // stack: IV_i, i, loc, ... - DUP3 - // stack: loc, IV_i, i, loc, ... - %mstore_current_general - // stack: i, loc, ... + // stack: IV_i, addr, i, addr, ... + MSTORE_GENERAL + // stack: i, addr, ... %increment SWAP1 %increment SWAP1 - // stack: i + 1, loc + 1,... + // stack: i + 1, addr + 1,... %endrep // stack: 4, start + 12, rounds, t0, t1, flag, retdest POP @@ -92,29 +91,28 @@ global blake2_f: // Last four values of the internal state: last four IV values, XOR'd with // the values (t0, t1, invert_if_flag, 0). %rep 4 - // stack: i, loc, val, next_val,... - DUP1 - // stack: i, i, loc, val, next_val,... + // stack: i, addr, val, next_val,... + DUP2 + DUP2 + // stack: i, addr, i, addr, val, next_val,... %blake2_iv - // stack: IV_i, i, loc, val, next_val,... - DUP4 - // stack: val, IV_i, i, loc, val, next_val,... + // stack: IV_i, addr, i, addr, val, next_val,... + DUP5 + // stack: val, IV_i, addr, i, addr, val, next_val,... XOR - // stack: val ^ IV_i, i, loc, val, next_val,... 
- DUP3 - // stack: loc, val ^ IV_i, i, loc, val, next_val,... - %mstore_current_general - // stack: i, loc, val, next_val,... + // stack: val ^ IV_i, addr, i, addr, val, next_val,... + MSTORE_GENERAL + // stack: i, addr, val, next_val,... %increment - // stack: i + 1, loc, val, next_val,... + // stack: i + 1, addr, val, next_val,... SWAP2 - // stack: val, loc, i + 1, next_val,... + // stack: val, addr, i + 1, next_val,... POP - // stack: loc, i + 1, next_val,... + // stack: addr, i + 1, next_val,... %increment - // stack: loc + 1, i + 1, next_val,... + // stack: addr + 1, i + 1, next_val,... SWAP1 - // stack: i + 1, loc + 1, next_val,... + // stack: i + 1, addr + 1, next_val,... %endrep // stack: 8, start + 16, rounds, retdest %pop2 diff --git a/evm/src/cpu/kernel/asm/hash/blake2/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2/compression.asm index 454e51280d..ba9ffc1343 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2/compression.asm @@ -21,10 +21,11 @@ compression_loop: // stack: addr, cur_block, retdest POP // stack: cur_block, retdest + PUSH 1 PUSH 0 %mload_current_general - // stack: num_blocks, cur_block, retdest - %decrement + // stack: num_blocks, 1, cur_block, retdest + SUB // stack: num_blocks - 1, cur_block, retdest DUP2 // stack: cur_block, num_blocks - 1, cur_block, retdest diff --git a/evm/src/cpu/kernel/asm/hash/blake2/g_functions.asm b/evm/src/cpu/kernel/asm/hash/blake2/g_functions.asm index 45e54ff43f..d521da6d80 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2/g_functions.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2/g_functions.asm @@ -11,28 +11,28 @@ DUP11 // stack: start, a, b, c, d, a, b, c, d, x, y, start ADD - %mload_current_general + MLOAD_GENERAL // stack: v[a], b, c, d, a, b, c, d, x, y, start SWAP1 // stack: b, v[a], c, d, a, b, c, d, x, y, start DUP11 // stack: start, b, v[a], c, d, a, b, c, d, x, y, start ADD - %mload_current_general + MLOAD_GENERAL // stack: v[b], v[a], c, d, 
a, b, c, d, x, y, start SWAP2 // stack: c, v[a], v[b], d, a, b, c, d, x, y, start DUP11 // stack: start, c, v[a], v[b], d, a, b, c, d, x, y, start ADD - %mload_current_general + MLOAD_GENERAL // stack: v[c], v[a], v[b], d, a, b, c, d, x, y, start SWAP3 // stack: d, v[a], v[b], v[c], a, b, c, d, x, y, start DUP11 // stack: start, d, v[a], v[b], v[c], a, b, c, d, x, y, start ADD - %mload_current_general + MLOAD_GENERAL // stack: v[d], v[a], v[b], v[c], a, b, c, d, x, y, start %stack (vd, vs: 3) -> (vs, vd) // stack: v[a], v[b], v[c], v[d], a, b, c, d, x, y, start @@ -95,13 +95,13 @@ %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (start, a, va, start, b, vb, start, c, vc, start, d, vd) // stack: start, a, v[a]'', start, b, v[b]'', start, c, v[c]'', start, d, v[d]'' ADD - %mstore_current_general + %swap_mstore ADD - %mstore_current_general + %swap_mstore ADD - %mstore_current_general + %swap_mstore ADD - %mstore_current_general + %swap_mstore %endmacro %macro call_blake2_g_function(a, b, c, d, x_idx, y_idx) @@ -113,7 +113,7 @@ // stack: s[y_idx], round, start %blake2_message_addr ADD - %mload_current_general + MLOAD_GENERAL // stack: m[s[y_idx]], round, start PUSH $x_idx DUP3 @@ -122,7 +122,7 @@ // stack: s[x_idx], m[s[y_idx]], round, start %blake2_message_addr ADD - %mload_current_general + MLOAD_GENERAL // stack: m[s[x_idx]], m[s[y_idx]], round, start %stack (ss: 2, r, s) -> (ss, s, r, s) // stack: m[s[x_idx]], m[s[y_idx]], start, round, start diff --git a/evm/src/cpu/kernel/asm/hash/blake2/hash.asm b/evm/src/cpu/kernel/asm/hash/blake2/hash.asm index 24ec9caba8..ab0d247633 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2/hash.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2/hash.asm @@ -5,13 +5,13 @@ blake2_generate_new_hash_value: // stack: addr, i, retdest DUP2 ADD - %mload_current_general + MLOAD_GENERAL // stack: h_i, i, retdest %blake2_internal_state_addr // stack: addr, h_i, i, retdest DUP3 ADD - %mload_current_general + MLOAD_GENERAL // stack: v_i, h_i, i, 
retdest %blake2_internal_state_addr // stack: addr, v_i, h_i, i, retdest @@ -21,7 +21,7 @@ blake2_generate_new_hash_value: // stack: i, addr, h_i, v_i, retdest ADD %add_const(8) - %mload_current_general + MLOAD_GENERAL // stack: v_(i+8), h_i, v_i, retdest XOR XOR diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index 5e1ff1f30a..a9467a00bc 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -4,6 +4,7 @@ // stack: num_blocks %mul_const(320) %add_const(2) + %build_current_general_address %endmacro global sha2_compression: @@ -24,9 +25,7 @@ global sha2_compression: // stack: i=0, message_schedule_addr, a[0]..h[0], retdest SWAP1 // stack: message_schedule_addr, i=0, a[0]..h[0], retdest - PUSH 0 - // stack: 0, message_schedule_addr, i=0, a[0]..h[0], retdest - %mload_current_general + %mload_current_general_no_offset // stack: num_blocks, message_schedule_addr, i=0, a[0]..h[0], retdest DUP1 // stack: num_blocks, num_blocks, message_schedule_addr, i=0, a[0]..h[0], retdest @@ -53,7 +52,7 @@ compression_loop: // stack: 4*i, message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest ADD // stack: message_schedule_addr + 4*i, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest - %mload_current_general_u32 + %mload_u32 // stack: W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest PUSH sha2_constants_k // stack: sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/main.asm b/evm/src/cpu/kernel/asm/hash/sha2/main.asm index b311262dbd..53967f8a17 100644 --- 
a/evm/src/cpu/kernel/asm/hash/sha2/main.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/main.asm @@ -1,20 +1,20 @@ global sha2: // stack: virt, num_bytes, retdest - SWAP1 - // stack: num_bytes, virt, retdest - DUP2 - // stack: virt, num_bytes, virt, retdest - %mstore_current_general - // stack: virt, retdest + %build_current_general_address + // stack: addr, num_bytes, retdest + DUP1 SWAP2 + // stack: num_bytes, addr, addr, retdest + MSTORE_GENERAL + // stack: addr, retdest -// Precodition: input is in memory, starting at virt of kernel general segment, of the form +// Precondition: input is in memory, starting at addr of kernel general segment, of the form // num_bytes, x[0], x[1], ..., x[num_bytes - 1] // Postcodition: output is in memory, starting at 0, of the form // num_blocks, block0[0], ..., block0[63], block1[0], ..., blocklast[63] global sha2_pad: - // stack: virt, retdest - %mload_current_general + // stack: addr, retdest + MLOAD_GENERAL // stack: num_bytes, retdest // STEP 1: append 1 // insert 128 (= 1 << 7) at x[num_bytes+1] @@ -31,7 +31,7 @@ global sha2_pad: DUP1 // stack: num_bytes, num_bytes, retdest %add_const(8) - %div_const(64) + %shr_const(6) %increment // stack: num_blocks = (num_bytes+8)//64 + 1, num_bytes, retdest @@ -50,8 +50,7 @@ global sha2_pad: DUP1 // stack: num_blocks, num_blocks, retdest // STEP 5: write num_blocks to x[0] - PUSH 0 - %mstore_current_general + %mstore_current_general_no_offset // stack: num_blocks, retdest %message_schedule_addr_from_num_blocks %jump(sha2_gen_all_message_schedules) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm index c9f542ce5f..66fa67a9b7 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm @@ -3,9 +3,10 @@ // stack: num_blocks %mul_const(64) %add_const(2) + %build_current_general_address %endmacro -// Precodition: stack contains address of one message block, 
followed by output address +// Precondition: stack contains address of one message block, followed by output address // Postcondition: 256 bytes starting at given output address contain the 64 32-bit chunks // of message schedule (in four-byte increments) gen_message_schedule_from_block: @@ -16,18 +17,17 @@ gen_message_schedule_from_block: // stack: block_addr + 32, block_addr, output_addr, retdest SWAP1 // stack: block_addr, block_addr + 32, output_addr, retdest - %mload_current_general_u256 + %mload_u256 // stack: block[0], block_addr + 32, output_addr, retdest SWAP1 // stack: block_addr + 32, block[0], output_addr, retdest - %mload_current_general_u256 + %mload_u256 // stack: block[1], block[0], output_addr, retdest SWAP2 // stack: output_addr, block[0], block[1], retdest %add_const(28) PUSH 8 // stack: counter=8, output_addr + 28, block[0], block[1], retdest - %jump(gen_message_schedule_from_block_0_loop) gen_message_schedule_from_block_0_loop: // Split the first half (256 bits) of the block into the first eight (32-bit) chunks of the message sdchedule. 
// stack: counter, output_addr, block[0], block[1], retdest @@ -43,7 +43,7 @@ gen_message_schedule_from_block_0_loop: // stack: block[0] % (1 << 32), block[0] >> 32, output_addr, counter, block[1], retdest DUP3 // stack: output_addr, block[0] % (1 << 32), block[0] >> 32, output_addr, counter, block[1], retdest - %mstore_current_general_u32 + %mstore_u32 // stack: block[0] >> 32, output_addr, counter, block[1], retdest SWAP1 // stack: output_addr, block[0] >> 32, counter, block[1], retdest @@ -81,7 +81,7 @@ gen_message_schedule_from_block_1_loop: // stack: block[1] % (1 << 32), block[1] >> 32, output_addr, counter, block[0], retdest DUP3 // stack: output_addr, block[1] % (1 << 32), block[1] >> 32, output_addr, counter, block[0], retdest - %mstore_current_general_u32 + %mstore_u32 // stack: block[1] >> 32, output_addr, counter, block[0], retdest SWAP1 // stack: output_addr, block[1] >> 32, counter, block[0], retdest @@ -111,39 +111,43 @@ gen_message_schedule_remaining_loop: // stack: counter, output_addr, block[0], block[1], retdest SWAP1 // stack: output_addr, counter, block[0], block[1], retdest - DUP1 - // stack: output_addr, output_addr, counter, block[0], block[1], retdest - %sub_const(8) + PUSH 8 + DUP2 + // stack: output_addr, 2*4, output_addr, counter, block[0], block[1], retdest + SUB // stack: output_addr - 2*4, output_addr, counter, block[0], block[1], retdest - %mload_current_general_u32 + %mload_u32 // stack: x[output_addr - 2*4], output_addr, counter, block[0], block[1], retdest %sha2_sigma_1 // stack: sigma_1(x[output_addr - 2*4]), output_addr, counter, block[0], block[1], retdest SWAP1 // stack: output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - DUP1 - // stack: output_addr, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %sub_const(28) + PUSH 28 + DUP2 + // stack: output_addr, 7*4, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest + SUB // stack: 
output_addr - 7*4, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %mload_current_general_u32 + %mload_u32 // stack: x[output_addr - 7*4], output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest SWAP1 // stack: output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - DUP1 - // stack: output_addr, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %sub_const(60) + PUSH 60 + DUP2 + // stack: output_addr, 15*4, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest + SUB // stack: output_addr - 15*4, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %mload_current_general_u32 + %mload_u32 // stack: x[output_addr - 15*4], output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest %sha2_sigma_0 // stack: sigma_0(x[output_addr - 15*4]), output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest SWAP1 // stack: output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - DUP1 - // stack: output_addr, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %sub_const(64) + PUSH 64 + DUP2 + // stack: output_addr, 16*4, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest + SUB // stack: output_addr - 16*4, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %mload_current_general_u32 + %mload_u32 // stack: x[output_addr - 16*4], output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], 
sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest SWAP1 // stack: output_addr, x[output_addr - 16*4], sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest @@ -155,7 +159,7 @@ gen_message_schedule_remaining_loop: // stack: sigma_1(x[output_addr - 2*4]) + x[output_addr - 16*4] + sigma_0(x[output_addr - 15*4]) + x[output_addr - 7*4], output_addr, counter, block[0], block[1], retdest DUP2 // stack: output_addr, sigma_1(x[output_addr - 2*4]) + x[output_addr - 16*4] + sigma_0(x[output_addr - 15*4]) + x[output_addr - 7*4], output_addr, counter, block[0], block[1], retdest - %mstore_current_general_u32 + %mstore_u32 // stack: output_addr, counter, block[0], block[1], retdest %add_const(4) // stack: output_addr + 4, counter, block[0], block[1], retdest @@ -178,12 +182,12 @@ global sha2_gen_all_message_schedules: // stack: output_addr, retdest DUP1 // stack: output_addr, output_addr, retdest - PUSH 0 - // stack: 0, output_addr, output_addr, retdest - %mload_current_general + %mload_current_general_no_offset // stack: num_blocks, output_addr, output_addr, retdest PUSH 1 - // stack: cur_addr = 1, counter = num_blocks, output_addr, output_addr, retdest + // stack: cur_offset = 1, counter = num_blocks, output_addr, output_addr, retdest + %build_current_general_address + // stack: cur_addr, counter, output_addr, output_addr, retdest gen_all_message_schedules_loop: // stack: cur_addr, counter, cur_output_addr, output_addr, retdest PUSH gen_all_message_schedules_loop_end diff --git a/evm/src/cpu/kernel/asm/hash/sha2/ops.asm b/evm/src/cpu/kernel/asm/hash/sha2/ops.asm index 6a4c5e3b77..d50e5c9a89 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/ops.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/ops.asm @@ -34,7 +34,7 @@ // stack: rotr(x, 18), x, rotr(x, 7) SWAP1 // stack: x, rotr(x, 18), rotr(x, 7) - %div_const(8) // equivalent to %shr_const(3) + %shr_const(3) // stack: shr(x, 3), rotr(x, 18), rotr(x, 7) 
XOR XOR diff --git a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm index 438875fb8b..9c2707b8d1 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm @@ -1,5 +1,6 @@ %macro sha2_write_length - // stack: last_addr, length + // stack: last_addr_offset, length + %build_current_general_address SWAP1 // stack: length, last_addr DUP1 @@ -8,7 +9,7 @@ // stack: length % (1 << 8), length, last_addr DUP3 // stack: last_addr, length % (1 << 8), length, last_addr - %mstore_current_general + %swap_mstore %rep 7 // For i = 0 to 6 @@ -17,15 +18,16 @@ %decrement SWAP1 // stack: length >> (8 * i), last_addr - i - 2 - %div_const(256) // equivalent to %shr_const(8) + %shr_const(8) // stack: length >> (8 * (i + 1)), last_addr - i - 2 - DUP1 - // stack: length >> (8 * (i + 1)), length >> (8 * (i + 1)), last_addr - i - 2 - %mod_const(256) + PUSH 256 + DUP2 + // stack: length >> (8 * (i + 1)), 256, length >> (8 * (i + 1)), last_addr - i - 2 + MOD // stack: (length >> (8 * (i + 1))) % (1 << 8), length >> (8 * (i + 1)), last_addr - i - 2 DUP3 // stack: last_addr - i - 2, (length >> (8 * (i + 1))) % (1 << 8), length >> (8 * (i + 1)), last_addr - i - 2 - %mstore_current_general + %swap_mstore %endrep %pop2 diff --git a/evm/src/cpu/kernel/asm/journal/journal.asm b/evm/src/cpu/kernel/asm/journal/journal.asm index a0e5502dc6..9ba4350878 100644 --- a/evm/src/cpu/kernel/asm/journal/journal.asm +++ b/evm/src/cpu/kernel/asm/journal/journal.asm @@ -182,9 +182,12 @@ // stack: (empty) %current_checkpoint // stack: current_checkpoint + DUP1 + PUSH @SEGMENT_JOURNAL_CHECKPOINTS + %build_kernel_address %journal_size - // stack: journal_size, current_checkpoint - DUP2 %mstore_kernel(@SEGMENT_JOURNAL_CHECKPOINTS) + // stack: journal_size, addr, current_checkpoint + MSTORE_GENERAL // stack: current_checkpoint %mload_context_metadata(@CTX_METADATA_CHECKPOINTS_LEN) // stack: i, 
current_checkpoint @@ -199,8 +202,9 @@ %endmacro %macro pop_checkpoint + PUSH 1 %mload_context_metadata(@CTX_METADATA_CHECKPOINTS_LEN) // stack: i - %decrement + SUB %mstore_context_metadata(@CTX_METADATA_CHECKPOINTS_LEN) %endmacro diff --git a/evm/src/cpu/kernel/asm/journal/log.asm b/evm/src/cpu/kernel/asm/journal/log.asm index 0b815faef6..e1794397b7 100644 --- a/evm/src/cpu/kernel/asm/journal/log.asm +++ b/evm/src/cpu/kernel/asm/journal/log.asm @@ -8,8 +8,9 @@ global revert_log: // stack: entry_type, ptr, retdest POP // First, reduce the number of logs. + PUSH 1 %mload_global_metadata(@GLOBAL_METADATA_LOGS_LEN) - %decrement + SUB %mstore_global_metadata(@GLOBAL_METADATA_LOGS_LEN) // stack: ptr, retdest // Second, restore payload length. diff --git a/evm/src/cpu/kernel/asm/main.asm b/evm/src/cpu/kernel/asm/main.asm index bd555218be..d78152f4be 100644 --- a/evm/src/cpu/kernel/asm/main.asm +++ b/evm/src/cpu/kernel/asm/main.asm @@ -1,52 +1,85 @@ global main: - // First, initialise the shift table + // First, hash the kernel code + %mload_global_metadata(@GLOBAL_METADATA_KERNEL_LEN) + PUSH 0 + // stack: addr, len + KECCAK_GENERAL + // stack: hash + %mload_global_metadata(@GLOBAL_METADATA_KERNEL_HASH) + // stack: expected_hash, hash + %assert_eq + + // Initialise the shift table %shift_table_init - // Initialize the block bloom filter - %initialize_block_bloom + // Initialize the RLP DATA pointer to its initial position (ctx == virt == 0, segment = RLP) + PUSH @SEGMENT_RLP_RAW + %mstore_global_metadata(@GLOBAL_METADATA_RLP_DATA_SIZE) - // Second, load all MPT data from the prover. - PUSH hash_initial_tries - %jump(load_all_mpts) + // Encode constant nodes + %initialize_rlp_segment + + // Initialize the state, transaction and receipt trie root pointers. 
+ PROVER_INPUT(trie_ptr::state) + %mstore_global_metadata(@GLOBAL_METADATA_STATE_TRIE_ROOT) + PROVER_INPUT(trie_ptr::txn) + %mstore_global_metadata(@GLOBAL_METADATA_TXN_TRIE_ROOT) + PROVER_INPUT(trie_ptr::receipt) + %mstore_global_metadata(@GLOBAL_METADATA_RECEIPT_TRIE_ROOT) global hash_initial_tries: - %mpt_hash_state_trie %mload_global_metadata(@GLOBAL_METADATA_STATE_TRIE_DIGEST_BEFORE) %assert_eq + // We compute the length of the trie data segment in `mpt_hash` so that we + // can check the value provided by the prover. + // We initialize the segment length with 1 because the segment contains + // the null pointer `0` when the tries are empty. + PUSH 1 + %mpt_hash_state_trie %mload_global_metadata(@GLOBAL_METADATA_STATE_TRIE_DIGEST_BEFORE) %assert_eq + // stack: trie_data_len %mpt_hash_txn_trie %mload_global_metadata(@GLOBAL_METADATA_TXN_TRIE_DIGEST_BEFORE) %assert_eq + // stack: trie_data_len %mpt_hash_receipt_trie %mload_global_metadata(@GLOBAL_METADATA_RECEIPT_TRIE_DIGEST_BEFORE) %assert_eq + // stack: trie_data_full_len + %mstore_global_metadata(@GLOBAL_METADATA_TRIE_DATA_SIZE) -global start_txns: +global start_txn: // stack: (empty) // The special case of an empty trie (i.e. for the first transaction) // is handled outside of the kernel. 
%mload_global_metadata(@GLOBAL_METADATA_TXN_NUMBER_BEFORE) // stack: txn_nb - %mload_global_metadata(@GLOBAL_METADATA_BLOCK_GAS_USED_BEFORE) - // stack: init_used_gas, txn_nb - DUP2 %scalar_to_rlp - // stack: txn_counter, init_gas_used, txn_nb + DUP1 %scalar_to_rlp + // stack: txn_counter, txn_nb DUP1 %num_bytes %mul_const(2) - // stack: num_nibbles, txn_counter, init_gas_used, txn_nb - SWAP2 - // stack: init_gas_used, txn_counter, num_nibbles, txn_nb + // stack: num_nibbles, txn_counter, txn_nb + %increment_bounded_rlp + // stack: txn_counter, num_nibbles, next_txn_counter, next_num_nibbles, txn_nb + %mload_global_metadata(@GLOBAL_METADATA_BLOCK_GAS_USED_BEFORE) + + // stack: init_gas_used, txn_counter, num_nibbles, next_txn_counter, next_num_nibbles, txn_nb -txn_loop: - // If the prover has no more txns for us to process, halt. - PROVER_INPUT(end_of_txns) - %jumpi(hash_final_tries) + // If the prover has no txn for us to process, halt. + PROVER_INPUT(no_txn) + %jumpi(execute_withdrawals) + + // Call route_txn. When we return, we will process the txn receipt. + PUSH txn_after + // stack: retdest, prev_gas_used, txn_counter, num_nibbles, next_txn_counter, next_num_nibbles, txn_nb + DUP4 DUP4 - // Call route_txn. When we return, continue the txn loop. 
- PUSH txn_loop_after - // stack: retdest, prev_gas_used, txn_counter, num_nibbles, txn_nb - DUP4 DUP4 %increment_bounded_rlp - %stack (next_txn_counter, next_num_nibbles, retdest, prev_gas_used, txn_counter, num_nibbles) -> (txn_counter, num_nibbles, retdest, prev_gas_used, txn_counter, num_nibbles, next_txn_counter, next_num_nibbles) %jump(route_txn) -global txn_loop_after: +global txn_after: // stack: success, leftover_gas, cur_cum_gas, prev_txn_counter, prev_num_nibbles, txn_counter, num_nibbles, txn_nb %process_receipt // stack: new_cum_gas, txn_counter, num_nibbles, txn_nb SWAP3 %increment SWAP3 - %jump(txn_loop) + %jump(execute_withdrawals_post_stack_op) + +global execute_withdrawals: + // stack: cum_gas, txn_counter, num_nibbles, next_txn_counter, next_num_nibbles, txn_nb + %stack (cum_gas, txn_counter, num_nibbles, next_txn_counter, next_num_nibbles) -> (cum_gas, txn_counter, num_nibbles) +execute_withdrawals_post_stack_op: + %withdrawals global hash_final_tries: // stack: cum_gas, txn_counter, num_nibbles, txn_nb @@ -54,68 +87,10 @@ global hash_final_tries: %mload_global_metadata(@GLOBAL_METADATA_BLOCK_GAS_USED_AFTER) %assert_eq DUP3 %mload_global_metadata(@GLOBAL_METADATA_TXN_NUMBER_AFTER) %assert_eq %pop3 - %check_metadata_block_bloom + PUSH 1 // initial trie data length %mpt_hash_state_trie %mload_global_metadata(@GLOBAL_METADATA_STATE_TRIE_DIGEST_AFTER) %assert_eq %mpt_hash_txn_trie %mload_global_metadata(@GLOBAL_METADATA_TXN_TRIE_DIGEST_AFTER) %assert_eq %mpt_hash_receipt_trie %mload_global_metadata(@GLOBAL_METADATA_RECEIPT_TRIE_DIGEST_AFTER) %assert_eq + // We don't need the trie data length here. + POP %jump(halt) - -initialize_block_bloom: - // stack: retdest - PUSH 0 PUSH 8 PUSH 0 - -initialize_bloom_loop: - // stack: i, len, offset, retdest - DUP2 DUP2 EQ %jumpi(initialize_bloom_loop_end) - PUSH 32 // Bloom word length - // stack: word_len, i, len, offset, retdest - // Load the next `block_bloom_before` word. 
- DUP2 %add_const(8) %mload_kernel(@SEGMENT_GLOBAL_BLOCK_BLOOM) - // stack: bloom_word, word_len, i, len, offset, retdest - DUP5 PUSH @SEGMENT_BLOCK_BLOOM PUSH 0 // Bloom word address in SEGMENT_BLOCK_BLOOM - %mstore_unpacking - // stack: new_offset, i, len, old_offset, retdest - SWAP3 POP %increment - // stack: i, len, new_offset, retdest - %jump(initialize_bloom_loop) - -initialize_bloom_loop_end: - // stack: len, len, offset, retdest - %pop3 - JUMP - -%macro initialize_block_bloom - // stack: (empty) - PUSH %%after - %jump(initialize_block_bloom) -%%after: -%endmacro - -check_metadata_block_bloom: - // stack: retdest - PUSH 0 PUSH 8 PUSH 0 - -check_bloom_loop: - // stack: i, len, offset, retdest - DUP2 DUP2 EQ %jumpi(check_bloom_loop_end) - PUSH 32 // Bloom word length - // stack: word_len, i, len, offset, retdest - DUP4 PUSH @SEGMENT_BLOCK_BLOOM PUSH 0 - %mload_packing - // stack: bloom_word, i, len, offset, retdest - DUP2 %add_const(16) %mload_kernel(@SEGMENT_GLOBAL_BLOCK_BLOOM) %assert_eq - // stack: i, len, offset, retdest - %increment SWAP2 %add_const(32) SWAP2 - // stack: i+1, len, new_offset, retdest - %jump(check_bloom_loop) - -check_bloom_loop_end: - // stack: len, len, offset, retdest - %pop3 - JUMP - -%macro check_metadata_block_bloom - PUSH %%after - %jump(check_metadata_block_bloom) -%%after: -%endmacro diff --git a/evm/src/cpu/kernel/asm/memory/core.asm b/evm/src/cpu/kernel/asm/memory/core.asm index 41d4927bf7..070e474f6e 100644 --- a/evm/src/cpu/kernel/asm/memory/core.asm +++ b/evm/src/cpu/kernel/asm/memory/core.asm @@ -1,39 +1,30 @@ // Load a big-endian u32, consisting of 4 bytes (c_3, c_2, c_1, c_0). %macro mload_u32 - // stack: context, segment, offset - %stack (addr: 3) -> (addr, 4, %%after) - %jump(mload_packing) -%%after: + // stack: addr + %stack (addr) -> (addr, 4) + MLOAD_32BYTES %endmacro // Load a little-endian u32, consisting of 4 bytes (c_0, c_1, c_2, c_3). 
%macro mload_u32_LE - // stack: context, segment, offset - DUP3 - DUP3 - DUP3 + // stack: addr + DUP1 MLOAD_GENERAL - // stack: c0, context, segment, offset - DUP4 + // stack: c0, addr + DUP2 %increment - DUP4 - DUP4 MLOAD_GENERAL %shl_const(8) ADD - // stack: c0 | (c1 << 8), context, segment, offset - DUP4 + // stack: c0 | (c1 << 8), addr + DUP2 %add_const(2) - DUP4 - DUP4 MLOAD_GENERAL %shl_const(16) ADD - // stack: c0 | (c1 << 8) | (c2 << 16), context, segment, offset - SWAP3 - %add_const(3) - SWAP2 + // stack: c0 | (c1 << 8) | (c2 << 16), addr SWAP1 + %add_const(3) MLOAD_GENERAL %shl_const(24) ADD // OR @@ -42,16 +33,12 @@ // Load a little-endian u64, consisting of 8 bytes (c_0, ..., c_7). %macro mload_u64_LE - // stack: context, segment, offset - DUP3 - DUP3 - DUP3 + // stack: addr + DUP1 %mload_u32_LE - // stack: lo, context, segment, offset - SWAP3 - %add_const(4) - SWAP2 + // stack: lo, addr SWAP1 + %add_const(4) %mload_u32_LE // stack: hi, lo %shl_const(32) @@ -62,18 +49,15 @@ // Load a big-endian u256. %macro mload_u256 - // stack: context, segment, offset - %stack (addr: 3) -> (addr, 32, %%after) - %jump(mload_packing) -%%after: + // stack: addr + %stack (addr) -> (addr, 32) + MLOAD_32BYTES %endmacro // Store a big-endian u32, consisting of 4 bytes (c_3, c_2, c_1, c_0). 
%macro mstore_u32 - // stack: context, segment, offset, value - %stack (addr: 3, value) -> (addr, value, 4, %%after) - %jump(mstore_unpacking) -%%after: + // stack: addr, value + MSTORE_32BYTES_4 // stack: offset POP %endmacro @@ -88,6 +72,7 @@ // stack: segment, offset GET_CONTEXT // stack: context, segment, offset + %build_address MLOAD_GENERAL // stack: value %endmacro @@ -102,6 +87,22 @@ // stack: segment, offset, value GET_CONTEXT // stack: context, segment, offset, value + %build_address + SWAP1 + MSTORE_GENERAL + // stack: (empty) +%endmacro + +%macro mstore_current(segment, offset) + // stack: value + PUSH $offset + // stack: offset, value + PUSH $segment + // stack: segment, offset, value + GET_CONTEXT + // stack: context, segment, offset, value + %build_address + SWAP1 MSTORE_GENERAL // stack: (empty) %endmacro @@ -109,7 +110,10 @@ // Load a single byte from user code. %macro mload_current_code // stack: offset - %mload_current(@SEGMENT_CODE) + // SEGMENT_CODE == 0 + GET_CONTEXT ADD + // stack: addr + MLOAD_GENERAL // stack: value %endmacro @@ -120,13 +124,18 @@ // stack: value %endmacro +// Load a single value from the kernel general memory, in the current context (not the kernel's context). +%macro mload_current_general_no_offset + // stack: + %build_current_general_address_no_offset + MLOAD_GENERAL + // stack: value +%endmacro + // Load a big-endian u32 from kernel general memory in the current context. %macro mload_current_general_u32 // stack: offset - PUSH @SEGMENT_KERNEL_GENERAL - // stack: segment, offset - GET_CONTEXT - // stack: context, segment, offset + %build_current_general_address %mload_u32 // stack: value %endmacro @@ -134,10 +143,7 @@ // Load a little-endian u32 from kernel general memory in the current context. 
%macro mload_current_general_u32_LE // stack: offset - PUSH @SEGMENT_KERNEL_GENERAL - // stack: segment, offset - GET_CONTEXT - // stack: context, segment, offset + %build_current_general_address %mload_u32_LE // stack: value %endmacro @@ -145,10 +151,7 @@ // Load a little-endian u64 from kernel general memory in the current context. %macro mload_current_general_u64_LE // stack: offset - PUSH @SEGMENT_KERNEL_GENERAL - // stack: segment, offset - GET_CONTEXT - // stack: context, segment, offset + %build_current_general_address %mload_u64_LE // stack: value %endmacro @@ -156,10 +159,7 @@ // Load a u256 from kernel general memory in the current context. %macro mload_current_general_u256 // stack: offset - PUSH @SEGMENT_KERNEL_GENERAL - // stack: segment, offset - GET_CONTEXT - // stack: context, segment, offset + %build_current_general_address %mload_u256 // stack: value %endmacro @@ -167,10 +167,17 @@ // Store a single value to kernel general memory in the current context. %macro mstore_current_general // stack: offset, value - PUSH @SEGMENT_KERNEL_GENERAL - // stack: segment, offset, value - GET_CONTEXT - // stack: context, segment, offset, value + %build_current_general_address + SWAP1 + MSTORE_GENERAL + // stack: (empty) +%endmacro + +// Store a single value to kernel general memory in the current context. +%macro mstore_current_general_no_offset + // stack: value + %build_current_general_address_no_offset + SWAP1 MSTORE_GENERAL // stack: (empty) %endmacro @@ -186,10 +193,7 @@ // Store a big-endian u32 to kernel general memory in the current context. 
%macro mstore_current_general_u32 // stack: offset, value - PUSH @SEGMENT_KERNEL_GENERAL - // stack: segment, offset, value - GET_CONTEXT - // stack: context, segment, offset, value + %build_current_general_address %mstore_u32 // stack: (empty) %endmacro @@ -209,8 +213,16 @@ // stack: offset PUSH $segment // stack: segment, offset - PUSH 0 // kernel has context 0 - // stack: context, segment, offset + %build_kernel_address + MLOAD_GENERAL + // stack: value +%endmacro + +// Load a single value from the given segment of kernel (context 0) memory. +%macro mload_kernel_no_offset(segment) + // stack: empty + PUSH $segment + // stack: addr MLOAD_GENERAL // stack: value %endmacro @@ -220,8 +232,19 @@ // stack: offset, value PUSH $segment // stack: segment, offset, value - PUSH 0 // kernel has context 0 - // stack: context, segment, offset, value + %build_kernel_address + // stack: addr, value + SWAP1 + MSTORE_GENERAL + // stack: (empty) +%endmacro + +// Store a single value from the given segment of kernel (context 0) memory. 
+%macro mstore_kernel_no_offset(segment) + // stack: value + PUSH $segment + // stack: addr, value + SWAP1 MSTORE_GENERAL // stack: (empty) %endmacro @@ -233,8 +256,9 @@ // stack: offset, value PUSH $segment // stack: segment, offset, value - PUSH 0 // kernel has context 0 - // stack: context, segment, offset, value + %build_kernel_address + // stack: addr, value + SWAP1 MSTORE_GENERAL // stack: (empty) %endmacro @@ -244,8 +268,7 @@ // stack: offset PUSH $segment // stack: segment, offset - PUSH 0 // kernel has context 0 - // stack: context, segment, offset + %build_kernel_address %mload_u32 %endmacro @@ -254,8 +277,7 @@ // stack: offset PUSH $segment // stack: segment, offset - PUSH 0 // kernel has context 0 - // stack: context, segment, offset + %build_kernel_address %mload_u32_LE %endmacro @@ -264,8 +286,7 @@ // stack: offset PUSH $segment // stack: segment, offset - PUSH 0 // kernel has context 0 - // stack: context, segment, offset + %build_kernel_address %mload_u64_LE %endmacro @@ -274,8 +295,7 @@ // stack: offset PUSH $segment // stack: segment, offset - PUSH 0 // kernel has context 0 - // stack: context, segment, offset + %build_kernel_address %mload_u256 %endmacro @@ -285,15 +305,16 @@ // stack: offset, value PUSH $segment // stack: segment, offset, value - PUSH 0 // kernel has context 0 - // stack: context, segment, offset, value + %build_kernel_address + // stack: addr, value %mstore_u32 %endmacro // Load a single byte from kernel code. %macro mload_kernel_code // stack: offset - %mload_kernel(@SEGMENT_CODE) + // ctx == SEGMENT_CODE == 0 + MLOAD_GENERAL // stack: value %endmacro @@ -310,7 +331,8 @@ // from kernel code. 
%macro mload_kernel_code_u32 // stack: offset - %mload_kernel_u32(@SEGMENT_CODE) + // ctx == SEGMENT_CODE == 0 + %mload_u32 // stack: value %endmacro @@ -321,7 +343,8 @@ PUSH $label ADD // stack: offset - %mload_kernel_u32(@SEGMENT_CODE) + // ctx == SEGMENT_CODE == 0 + %mload_u32 // stack: value %endmacro @@ -366,7 +389,8 @@ // Load a u256 (big-endian) from kernel code. %macro mload_kernel_code_u256 // stack: offset - %mload_kernel_u256(@SEGMENT_CODE) + // ctx == SEGMENT_CODE == 0 + %mload_u256 // stack: value %endmacro @@ -380,7 +404,8 @@ // Store a single byte to kernel code. %macro mstore_kernel_code // stack: offset, value - %mstore_kernel(@SEGMENT_CODE) + // ctx == SEGMENT_CODE == 0 + MSTORE_GENERAL // stack: (empty) %endmacro @@ -388,13 +413,14 @@ // to kernel code. %macro mstore_kernel_code_u32 // stack: offset, value - %mstore_kernel_u32(@SEGMENT_CODE) + // ctx == SEGMENT_CODE == 0 + %mstore_u32 %endmacro -// Store a single byte to @SEGMENT_RLP_RAW. -%macro mstore_rlp - // stack: offset, value - %mstore_kernel(@SEGMENT_RLP_RAW) +%macro swap_mstore + // stack: addr, value + SWAP1 + MSTORE_GENERAL // stack: (empty) %endmacro diff --git a/evm/src/cpu/kernel/asm/memory/memcpy.asm b/evm/src/cpu/kernel/asm/memory/memcpy.asm index e737dc33ca..a7819bf6e8 100644 --- a/evm/src/cpu/kernel/asm/memory/memcpy.asm +++ b/evm/src/cpu/kernel/asm/memory/memcpy.asm @@ -1,55 +1,36 @@ -// Copies `count` values from -// SRC = (src_ctx, src_segment, src_addr) -// to -// DST = (dst_ctx, dst_segment, dst_addr). -// These tuple definitions are used for brevity in the stack comments below. +// Copies `count` values from SRC to DST. global memcpy: // stack: DST, SRC, count, retdest - DUP7 + DUP3 // stack: count, DST, SRC, count, retdest ISZERO // stack: count == 0, DST, SRC, count, retdest %jumpi(memcpy_finish) // stack: DST, SRC, count, retdest + DUP1 // Copy the next value. 
- DUP6 - DUP6 - DUP6 - // stack: SRC, DST, SRC, count, retdest + DUP3 + // stack: SRC, DST, DST, SRC, count, retdest MLOAD_GENERAL - // stack: value, DST, SRC, count, retdest - DUP4 - DUP4 - DUP4 - // stack: DST, value, DST, SRC, count, retdest + // stack: value, DST, DST, SRC, count, retdest MSTORE_GENERAL // stack: DST, SRC, count, retdest // Increment dst_addr. - SWAP2 %increment - SWAP2 // Increment src_addr. - SWAP5 + SWAP1 %increment - SWAP5 + SWAP1 // Decrement count. - SWAP6 - %decrement - SWAP6 + PUSH 1 DUP4 SUB SWAP3 POP // Continue the loop. %jump(memcpy) -memcpy_finish: - // stack: DST, SRC, count, retdest - %pop7 - // stack: retdest - JUMP - %macro memcpy - %stack (dst: 3, src: 3, count) -> (dst, src, count, %%after) + %stack (dst, src, count) -> (dst, src, count, %%after) %jump(memcpy) %%after: %endmacro @@ -58,51 +39,31 @@ memcpy_finish: global memcpy_bytes: // stack: DST, SRC, count, retdest - // Handle empty case - DUP7 - // stack: count, DST, SRC, count, retdest - ISZERO - // stack: count == 0, DST, SRC, count, retdest - %jumpi(memcpy_bytes_empty) - - // stack: DST, SRC, count, retdest - // Handle small case - DUP7 + DUP3 // stack: count, DST, SRC, count, retdest - %lt_const(0x20) - // stack: count < 32, DST, SRC, count, retdest + %lt_const(0x21) + // stack: count <= 32, DST, SRC, count, retdest %jumpi(memcpy_bytes_finish) // We will pack 32 bytes into a U256 from the source, and then unpack it at the destination. // Copy the next chunk of bytes. + // stack: DST, SRC, count, retdest PUSH 32 - DUP1 - DUP8 - DUP8 - DUP8 - // stack: SRC, 32, 32, DST, SRC, count, retdest + DUP3 + // stack: SRC, 32, DST, SRC, count, retdest MLOAD_32BYTES - // stack: value, 32, DST, SRC, count, retdest - DUP5 - DUP5 - DUP5 - // stack: DST, value, 32, DST, SRC, count, retdest - MSTORE_32BYTES - // stack: DST, SRC, count, retdest - - // Increment dst_addr by 32. - SWAP2 - %add_const(0x20) - SWAP2 - // Increment src_addr by 32. 
- SWAP5 + // stack: value, DST, SRC, count, retdest + SWAP1 + // stack: DST, value, SRC, count, retdest + MSTORE_32BYTES_32 + // stack: DST', SRC, count, retdest + // Increment SRC by 32. + SWAP1 %add_const(0x20) - SWAP5 + SWAP1 // Decrement count by 32. - SWAP6 - %sub_const(0x20) - SWAP6 + PUSH 32 DUP4 SUB SWAP3 POP // Continue the loop. %jump(memcpy_bytes) @@ -110,34 +71,36 @@ global memcpy_bytes: memcpy_bytes_finish: // stack: DST, SRC, count, retdest + // Handle empty case + DUP3 + // stack: count, DST, SRC, count, retdest + ISZERO + // stack: count == 0, DST, SRC, count, retdest + %jumpi(memcpy_finish) + + // stack: DST, SRC, count, retdest + // Copy the last chunk of `count` bytes. - DUP7 + DUP3 DUP1 - DUP8 - DUP8 - DUP8 + DUP4 // stack: SRC, count, count, DST, SRC, count, retdest MLOAD_32BYTES // stack: value, count, DST, SRC, count, retdest - DUP5 - DUP5 - DUP5 + DUP3 // stack: DST, value, count, DST, SRC, count, retdest - MSTORE_32BYTES - // stack: DST, SRC, count, retdest + %mstore_unpacking + // stack: new_offset, DST, SRC, count, retdest + POP - %pop7 - // stack: retdest - JUMP - -memcpy_bytes_empty: - // stack: DST, SRC, 0, retdest - %pop7 +memcpy_finish: + // stack: DST, SRC, count, retdest + %pop3 // stack: retdest JUMP %macro memcpy_bytes - %stack (dst: 3, src: 3, count) -> (dst, src, count, %%after) + %stack (dst, src, count) -> (dst, src, count, %%after) %jump(memcpy_bytes) %%after: %endmacro diff --git a/evm/src/cpu/kernel/asm/memory/memset.asm b/evm/src/cpu/kernel/asm/memory/memset.asm index b8d4410708..792aeabc68 100644 --- a/evm/src/cpu/kernel/asm/memory/memset.asm +++ b/evm/src/cpu/kernel/asm/memory/memset.asm @@ -1,63 +1,49 @@ -// Sets `count` values to 0 at -// DST = (dst_ctx, dst_segment, dst_addr). -// This tuple definition is used for brevity in the stack comments below. +// Sets `count` values to 0 at DST. 
global memset: // stack: DST, count, retdest - // Handle empty case - DUP4 - // stack: count, DST, count, retdest - ISZERO - // stack: count == 0, DST, count, retdest - %jumpi(memset_bytes_empty) - - // stack: DST, count, retdest - // Handle small case - DUP4 + DUP2 // stack: count, DST, count, retdest - %lt_const(0x20) - // stack: count < 32, DST, count, retdest + %lt_const(0x21) + // stack: count <= 32, DST, count, retdest %jumpi(memset_finish) // stack: DST, count, retdest - PUSH 32 PUSH 0 - DUP5 - DUP5 - DUP5 - // stack: DST, 0, 32, DST, count, retdest - MSTORE_32BYTES - // stack: DST, count, retdest - - // Increment dst_addr. - SWAP2 - %add_const(0x20) - SWAP2 + SWAP1 + // stack: DST, 0, count, retdest + MSTORE_32BYTES_32 + // stack: DST', count, retdest // Decrement count. - SWAP3 - %sub_const(0x20) - SWAP3 + PUSH 32 DUP3 SUB SWAP2 POP // Continue the loop. %jump(memset) memset_finish: // stack: DST, final_count, retdest - DUP4 + + // Handle empty case + DUP2 + // stack: final_count, DST, final_count, retdest + ISZERO + // stack: final_count == 0, DST, final_count, retdest + %jumpi(memset_bytes_empty) + + // stack: DST, final_count, retdest + DUP2 PUSH 0 - DUP5 - DUP5 - DUP5 + DUP3 // stack: DST, 0, final_count, DST, final_count, retdest - MSTORE_32BYTES + %mstore_unpacking // stack: DST, final_count, retdest - %pop4 + %pop3 // stack: retdest JUMP memset_bytes_empty: // stack: DST, 0, retdest - %pop4 + %pop2 // stack: retdest JUMP diff --git a/evm/src/cpu/kernel/asm/memory/metadata.asm b/evm/src/cpu/kernel/asm/memory/metadata.asm index 203fd06ce3..f2dc897a1d 100644 --- a/evm/src/cpu/kernel/asm/memory/metadata.asm +++ b/evm/src/cpu/kernel/asm/memory/metadata.asm @@ -1,62 +1,104 @@ // Load the given global metadata field from memory. %macro mload_global_metadata(field) + // Global metadata are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. 
+ // stack: (empty) PUSH $field - // stack: offset - %mload_kernel(@SEGMENT_GLOBAL_METADATA) + MLOAD_GENERAL // stack: value %endmacro // Store the given global metadata field to memory. %macro mstore_global_metadata(field) + // Global metadata are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. + // stack: value PUSH $field - // stack: offset, value - %mstore_kernel(@SEGMENT_GLOBAL_METADATA) + SWAP1 + MSTORE_GENERAL // stack: (empty) %endmacro // Load the given context metadata field from memory. %macro mload_context_metadata(field) + // Context metadata are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. + // stack: (empty) PUSH $field - // stack: offset - %mload_current(@SEGMENT_CONTEXT_METADATA) + GET_CONTEXT + ADD + // stack: addr + MLOAD_GENERAL // stack: value %endmacro // Store the given context metadata field to memory. %macro mstore_context_metadata(field) + // Context metadata are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. + // stack: value PUSH $field - // stack: offset, value - %mstore_current(@SEGMENT_CONTEXT_METADATA) + GET_CONTEXT + ADD + // stack: addr, value + SWAP1 + MSTORE_GENERAL // stack: (empty) %endmacro // Store the given context metadata field to memory. %macro mstore_context_metadata(field, value) - PUSH $value + // Context metadata are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. 
+ PUSH $field - // stack: offset, value - %mstore_current(@SEGMENT_CONTEXT_METADATA) + GET_CONTEXT + ADD + // stack: addr + PUSH $value + // stack: value, addr + MSTORE_GENERAL // stack: (empty) %endmacro %macro mstore_parent_context_metadata(field) + // Context metadata are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. + // stack: value %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, value) -> - (parent_ctx, @SEGMENT_CONTEXT_METADATA, $field, value) + + // stack: parent_ctx, value + PUSH $field ADD + // stack: addr, value + SWAP1 MSTORE_GENERAL // stack: (empty) %endmacro %macro mstore_parent_context_metadata(field, value) + // Context metadata are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. + // stack: (empty) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx) -> - (parent_ctx, @SEGMENT_CONTEXT_METADATA, $field, $value) + + // stack: parent_ctx + PUSH $field ADD + // stack: addr + PUSH $value + // stack: value, addr MSTORE_GENERAL // stack: (empty) %endmacro @@ -330,7 +372,7 @@ zero_hash: // stack: num_bytes %add_const(31) // stack: 31 + num_bytes - %div_const(32) + %shr_const(5) // stack: (num_bytes + 31) / 32 %endmacro @@ -343,7 +385,7 @@ zero_hash: SWAP1 // stack: num_words, num_words * GAS_MEMORY %square - %div_const(512) + %shr_const(9) // stack: num_words^2 / 512, num_words * GAS_MEMORY ADD // stack: cost = num_words^2 / 512 + num_words * GAS_MEMORY @@ -393,8 +435,9 @@ zero_hash: %endmacro %macro decrement_call_depth + PUSH 1 %mload_global_metadata(@GLOBAL_METADATA_CALL_STACK_DEPTH) - %decrement + SUB %mstore_global_metadata(@GLOBAL_METADATA_CALL_STACK_DEPTH) %endmacro diff --git a/evm/src/cpu/kernel/asm/memory/packing.asm b/evm/src/cpu/kernel/asm/memory/packing.asm index 1dbbf39362..a1bf5a09ad 100644 --- 
a/evm/src/cpu/kernel/asm/memory/packing.asm +++ b/evm/src/cpu/kernel/asm/memory/packing.asm @@ -1,92 +1,321 @@ // Methods for encoding integers as bytes in memory, as well as the reverse, -// decoding bytes as integers. All big-endian. - -// Given a pointer to some bytes in memory, pack them into a word. Assumes 0 < len <= 32. -// Pre stack: addr: 3, len, retdest -// Post stack: packed_value -// NOTE: addr: 3 denotes a (context, segment, virtual) tuple -global mload_packing: - // stack: addr: 3, len, retdest - MLOAD_32BYTES - // stack: packed_value, retdest - SWAP1 - // stack: retdest, packed_value - JUMP - -%macro mload_packing - %stack (addr: 3, len) -> (addr, len, %%after) - %jump(mload_packing) -%%after: -%endmacro +// decoding bytes as integers. All big-endian unless specified. global mload_packing_u64_LE: - // stack: context, segment, offset, retdest - DUP3 DUP3 DUP3 MLOAD_GENERAL - DUP4 %add_const(1) DUP4 DUP4 MLOAD_GENERAL %shl_const( 8) ADD - DUP4 %add_const(2) DUP4 DUP4 MLOAD_GENERAL %shl_const(16) ADD - DUP4 %add_const(3) DUP4 DUP4 MLOAD_GENERAL %shl_const(24) ADD - DUP4 %add_const(4) DUP4 DUP4 MLOAD_GENERAL %shl_const(32) ADD - DUP4 %add_const(5) DUP4 DUP4 MLOAD_GENERAL %shl_const(40) ADD - DUP4 %add_const(6) DUP4 DUP4 MLOAD_GENERAL %shl_const(48) ADD - DUP4 %add_const(7) DUP4 DUP4 MLOAD_GENERAL %shl_const(56) ADD - %stack (value, context, segment, offset, retdest) -> (retdest, value) + // stack: addr, retdest + DUP1 MLOAD_GENERAL + DUP2 %add_const(1) MLOAD_GENERAL %shl_const( 8) ADD + DUP2 %add_const(2) MLOAD_GENERAL %shl_const(16) ADD + DUP2 %add_const(3) MLOAD_GENERAL %shl_const(24) ADD + DUP2 %add_const(4) MLOAD_GENERAL %shl_const(32) ADD + DUP2 %add_const(5) MLOAD_GENERAL %shl_const(40) ADD + DUP2 %add_const(6) MLOAD_GENERAL %shl_const(48) ADD + DUP2 %add_const(7) MLOAD_GENERAL %shl_const(56) ADD + %stack (value, addr, retdest) -> (retdest, value) JUMP %macro mload_packing_u64_LE - %stack (addr: 3) -> (addr, %%after) + %stack (addr) -> (addr, 
%%after) %jump(mload_packing_u64_LE) %%after: %endmacro -// Pre stack: context, segment, offset, value, len, retdest -// Post stack: offset' +// Pre stack: addr, value, len, retdest +// Post stack: addr' global mstore_unpacking: - // stack: context, segment, offset, value, len, retdest - %stack(context, segment, offset, value, len, retdest) -> (context, segment, offset, value, len, offset, len, retdest) - // stack: context, segment, offset, value, len, offset, len, retdest - MSTORE_32BYTES - // stack: offset, len, retdest - ADD SWAP1 - // stack: retdest, offset' + // stack: addr, value, len, retdest + DUP3 ISZERO + // stack: len == 0, addr, value, len, retdest + %jumpi(mstore_unpacking_empty) + %stack(addr, value, len, retdest) -> (len, addr, value, retdest) + PUSH 3 + // stack: BYTES_PER_JUMP, len, addr, value, retdest + MUL + // stack: jump_offset, addr, value, retdest + PUSH mstore_unpacking_0 + // stack: mstore_unpacking_0, jump_offset, addr, value, retdest + ADD + // stack: address_unpacking, addr, value, retdest + JUMP + +mstore_unpacking_empty: + %stack(addr, value, len, retdest) -> (retdest, addr) + JUMP + +// This case can never be reached. It's only here to offset the table correctly. 
+mstore_unpacking_0: + %rep 3 + PANIC + %endrep +mstore_unpacking_1: + // stack: addr, value, retdest + MSTORE_32BYTES_1 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_2: + // stack: addr, value, retdest + MSTORE_32BYTES_2 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_3: + // stack: addr, value, retdest + MSTORE_32BYTES_3 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_4: + // stack: addr, value, retdest + MSTORE_32BYTES_4 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_5: + // stack: addr, value, retdest + MSTORE_32BYTES_5 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_6: + // stack: addr, value, retdest + MSTORE_32BYTES_6 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_7: + // stack: addr, value, retdest + MSTORE_32BYTES_7 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_8: + // stack: addr, value, retdest + MSTORE_32BYTES_8 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_9: + // stack: addr, value, retdest + MSTORE_32BYTES_9 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_10: + // stack: addr, value, retdest + MSTORE_32BYTES_10 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_11: + // stack: addr, value, retdest + MSTORE_32BYTES_11 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_12: + // stack: addr, value, retdest + MSTORE_32BYTES_12 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_13: + // stack: addr, value, retdest + MSTORE_32BYTES_13 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_14: + // stack: addr, value, retdest + MSTORE_32BYTES_14 + // stack: addr', 
retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_15: + // stack: addr, value, retdest + MSTORE_32BYTES_15 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_16: + // stack: addr, value, retdest + MSTORE_32BYTES_16 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_17: + // stack: addr, value, retdest + MSTORE_32BYTES_17 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_18: + // stack: addr, value, retdest + MSTORE_32BYTES_18 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_19: + // stack: addr, value, retdest + MSTORE_32BYTES_19 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_20: + // stack: addr, value, retdest + MSTORE_32BYTES_20 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_21: + // stack: addr, value, retdest + MSTORE_32BYTES_21 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_22: + // stack: addr, value, retdest + MSTORE_32BYTES_22 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_23: + // stack: addr, value, retdest + MSTORE_32BYTES_23 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_24: + // stack: addr, value, retdest + MSTORE_32BYTES_24 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_25: + // stack: addr, value, retdest + MSTORE_32BYTES_25 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_26: + // stack: addr, value, retdest + MSTORE_32BYTES_26 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_27: + // stack: addr, value, retdest + MSTORE_32BYTES_27 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_28: + // stack: addr, value, retdest + MSTORE_32BYTES_28 + 
// stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_29: + // stack: addr, value, retdest + MSTORE_32BYTES_29 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_30: + // stack: addr, value, retdest + MSTORE_32BYTES_30 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_31: + // stack: addr, value, retdest + MSTORE_32BYTES_31 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' + JUMP +mstore_unpacking_32: + // stack: addr, value, retdest + MSTORE_32BYTES_32 + // stack: addr', retdest + SWAP1 + // stack: retdest, addr' JUMP %macro mstore_unpacking - %stack (addr: 3, value, len) -> (addr, value, len, %%after) + %stack (addr, value, len) -> (addr, value, len, %%after) %jump(mstore_unpacking) %%after: %endmacro -// Pre stack: context, segment, offset, value, retdest -// Post stack: offset' +// Pre stack: addr, value, retdest +// Post stack: addr' global mstore_unpacking_u64_LE: - %stack (context, segment, offset, value) -> (0xff, value, context, segment, offset, value) + %stack (addr, value) -> (0xff, value, addr, addr, value) AND - DUP4 DUP4 DUP4 MSTORE_GENERAL // First byte - %stack (context, segment, offset, value) -> (0xff00, value, context, segment, offset, value) + MSTORE_GENERAL // First byte + DUP1 %add_const(1) + %stack (new_addr, addr, value) -> (0xff00, value, new_addr, addr, value) AND %shr_const(8) - DUP4 %add_const(1) DUP4 DUP4 MSTORE_GENERAL // Second byte - %stack (context, segment, offset, value) -> (0xff0000, value, context, segment, offset, value) + MSTORE_GENERAL // Second byte + DUP1 %add_const(2) + %stack (new_addr, addr, value) -> (0xff0000, value, new_addr, addr, value) AND %shr_const(16) - DUP4 %add_const(2) DUP4 DUP4 MSTORE_GENERAL // Third byte - %stack (context, segment, offset, value) -> (0xff000000, value, context, segment, offset, value) + MSTORE_GENERAL // Third byte + DUP1 %add_const(3) + %stack (new_addr, addr, value) -> 
(0xff000000, value, new_addr, addr, value) AND %shr_const(24) - DUP4 %add_const(3) DUP4 DUP4 MSTORE_GENERAL // Fourth byte - %stack (context, segment, offset, value) -> (0xff00000000, value, context, segment, offset, value) + MSTORE_GENERAL // Fourth byte + DUP1 %add_const(4) + %stack (new_addr, addr, value) -> (0xff00000000, value, new_addr, addr, value) AND %shr_const(32) - DUP4 %add_const(4) DUP4 DUP4 MSTORE_GENERAL // Fifth byte - %stack (context, segment, offset, value) -> (0xff0000000000, value, context, segment, offset, value) + MSTORE_GENERAL // Fifth byte + DUP1 %add_const(5) + %stack (new_addr, addr, value) -> (0xff0000000000, value, new_addr, addr, value) AND %shr_const(40) - DUP4 %add_const(5) DUP4 DUP4 MSTORE_GENERAL // Sixth byte - %stack (context, segment, offset, value) -> (0xff000000000000, value, context, segment, offset, value) + MSTORE_GENERAL // Sixth byte + DUP1 %add_const(6) + %stack (new_addr, addr, value) -> (0xff000000000000, value, new_addr, addr, value) AND %shr_const(48) - DUP4 %add_const(6) DUP4 DUP4 MSTORE_GENERAL // Seventh byte - %stack (context, segment, offset, value) -> (0xff00000000000000, value, context, segment, offset, value) + MSTORE_GENERAL // Seventh byte + DUP1 %add_const(7) + %stack (new_addr, addr, value) -> (0xff00000000000000, value, new_addr, addr, value) AND %shr_const(56) - DUP4 %add_const(7) DUP4 DUP4 MSTORE_GENERAL // Eighth byte - %pop4 JUMP + MSTORE_GENERAL // Eighth byte + %pop2 JUMP %macro mstore_unpacking_u64_LE - %stack (addr: 3, value) -> (addr, value, %%after) + %stack (addr, value) -> (addr, value, %%after) %jump(mstore_unpacking_u64_LE) %%after: %endmacro diff --git a/evm/src/cpu/kernel/asm/memory/syscalls.asm b/evm/src/cpu/kernel/asm/memory/syscalls.asm index caf0136e8f..97607d1918 100644 --- a/evm/src/cpu/kernel/asm/memory/syscalls.asm +++ b/evm/src/cpu/kernel/asm/memory/syscalls.asm @@ -11,7 +11,8 @@ global sys_mload: %stack(kexit_info, offset) -> (offset, 32, kexit_info) PUSH @SEGMENT_MAIN_MEMORY 
GET_CONTEXT - // stack: addr: 3, len, kexit_info + %build_address + // stack: addr, len, kexit_info MLOAD_32BYTES %stack (value, kexit_info) -> (kexit_info, value) EXIT_KERNEL @@ -26,11 +27,13 @@ global sys_mstore: // stack: expanded_num_bytes, kexit_info, offset, value %update_mem_bytes // stack: kexit_info, offset, value - %stack(kexit_info, offset, value) -> (offset, value, 32, kexit_info) + %stack(kexit_info, offset, value) -> (offset, value, kexit_info) PUSH @SEGMENT_MAIN_MEMORY GET_CONTEXT - // stack: addr: 3, value, len, kexit_info - MSTORE_32BYTES + %build_address + // stack: addr, value, kexit_info + MSTORE_32BYTES_32 + POP // stack: kexit_info EXIT_KERNEL @@ -57,10 +60,11 @@ global sys_calldataload: %mload_context_metadata(@CTX_METADATA_CALLDATA_SIZE) %stack (calldata_size, kexit_info, i) -> (calldata_size, i, kexit_info, i) LT %jumpi(calldataload_large_offset) - %stack (kexit_info, i) -> (@SEGMENT_CALLDATA, i, 32, sys_calldataload_after_mload_packing, kexit_info) + %stack (kexit_info, i) -> (@SEGMENT_CALLDATA, i, 32, kexit_info) GET_CONTEXT - // stack: ADDR: 3, 32, sys_calldataload_after_mload_packing, kexit_info - %jump(mload_packing) + %build_address + // stack: addr, 32, kexit_info + MLOAD_32BYTES sys_calldataload_after_mload_packing: // stack: value, kexit_info SWAP1 @@ -70,15 +74,10 @@ calldataload_large_offset: %stack (kexit_info, i) -> (kexit_info, 0) EXIT_KERNEL -// Macro for {CALLDATA,CODE,RETURNDATA}COPY (W_copy in Yellow Paper). +// Macro for {CALLDATA, RETURNDATA}COPY (W_copy in Yellow Paper). 
%macro wcopy(segment, context_metadata_size) // stack: kexit_info, dest_offset, offset, size - PUSH @GAS_VERYLOW - DUP5 - // stack: size, Gverylow, kexit_info, dest_offset, offset, size - ISZERO %jumpi(wcopy_empty) - // stack: Gverylow, kexit_info, dest_offset, offset, size - DUP5 %num_bytes_to_num_words %mul_const(@GAS_COPY) ADD %charge_gas + %wcopy_charge_gas %stack (kexit_info, dest_offset, offset, size) -> (dest_offset, size, kexit_info, dest_offset, offset, size) %add_or_fault @@ -92,139 +91,139 @@ calldataload_large_offset: // stack: offset, total_size, kexit_info, dest_offset, offset, size GT %jumpi(wcopy_large_offset) + // stack: kexit_info, dest_offset, offset, size + GET_CONTEXT PUSH $segment - %mload_context_metadata($context_metadata_size) - // stack: total_size, segment, kexit_info, dest_offset, offset, size - DUP6 DUP6 ADD - // stack: offset + size, total_size, segment, kexit_info, dest_offset, offset, size - LT %jumpi(wcopy_within_bounds) - - %mload_context_metadata($context_metadata_size) - // stack: total_size, segment, kexit_info, dest_offset, offset, size - DUP6 DUP6 ADD - // stack: offset + size, total_size, segment, kexit_info, dest_offset, offset, size - SUB // extra_size = offset + size - total_size - // stack: extra_size, segment, kexit_info, dest_offset, offset, size - DUP1 DUP7 SUB - // stack: copy_size = size - extra_size, extra_size, segment, kexit_info, dest_offset, offset, size - - // Compute the new dest_offset after actual copies, at which we will start padding with zeroes. 
- DUP1 DUP6 ADD - // stack: new_dest_offset, copy_size, extra_size, segment, kexit_info, dest_offset, offset, size + // stack: segment, context, kexit_info, dest_offset, offset, size + %jump(wcopy_within_bounds) +%endmacro - GET_CONTEXT - %stack (context, new_dest_offset, copy_size, extra_size, segment, kexit_info, dest_offset, offset, size) -> - (context, @SEGMENT_MAIN_MEMORY, dest_offset, context, segment, offset, copy_size, wcopy_over_range, new_dest_offset, extra_size, kexit_info) - %jump(memcpy_bytes) +%macro wcopy_charge_gas + // stack: kexit_info, dest_offset, offset, size + PUSH @GAS_VERYLOW + DUP5 + // stack: size, Gverylow, kexit_info, dest_offset, offset, size + ISZERO %jumpi(wcopy_empty) + // stack: Gverylow, kexit_info, dest_offset, offset, size + DUP5 %num_bytes_to_num_words %mul_const(@GAS_COPY) ADD %charge_gas %endmacro + +codecopy_within_bounds: + // stack: total_size, segment, src_ctx, kexit_info, dest_offset, offset, size + POP wcopy_within_bounds: - // stack: segment, kexit_info, dest_offset, offset, size + // TODO: rework address creation to have less stack manipulation overhead + // stack: segment, src_ctx, kexit_info, dest_offset, offset, size GET_CONTEXT - %stack (context, segment, kexit_info, dest_offset, offset, size) -> - (context, @SEGMENT_MAIN_MEMORY, dest_offset, context, segment, offset, size, wcopy_after, kexit_info) + %stack (context, segment, src_ctx, kexit_info, dest_offset, offset, size) -> + (src_ctx, segment, offset, @SEGMENT_MAIN_MEMORY, dest_offset, context, size, wcopy_after, kexit_info) + %build_address + SWAP3 %build_address + // stack: DST, SRC, size, wcopy_after, kexit_info %jump(memcpy_bytes) - -// Same as wcopy_large_offset, but without `offset` in the stack. 
-wcopy_over_range: - // stack: dest_offset, size, kexit_info - GET_CONTEXT - %stack (context, dest_offset, size, kexit_info) -> - (context, @SEGMENT_MAIN_MEMORY, dest_offset, size, wcopy_after, kexit_info) - %jump(memset) - wcopy_empty: // stack: Gverylow, kexit_info, dest_offset, offset, size %charge_gas %stack (kexit_info, dest_offset, offset, size) -> (kexit_info) EXIT_KERNEL + +codecopy_large_offset: + // stack: total_size, src_ctx, kexit_info, dest_offset, offset, size + %pop2 wcopy_large_offset: // offset is larger than the size of the {CALLDATA,CODE,RETURNDATA}. So we just have to write zeros. // stack: kexit_info, dest_offset, offset, size GET_CONTEXT %stack (context, kexit_info, dest_offset, offset, size) -> (context, @SEGMENT_MAIN_MEMORY, dest_offset, size, wcopy_after, kexit_info) + %build_address %jump(memset) wcopy_after: // stack: kexit_info EXIT_KERNEL +// Pre stack: kexit_info, dest_offset, offset, size +// Post stack: (empty) global sys_calldatacopy: %wcopy(@SEGMENT_CALLDATA, @CTX_METADATA_CALLDATA_SIZE) -global sys_codecopy: - %wcopy(@SEGMENT_CODE, @CTX_METADATA_CODE_SIZE) - -// Same as %wcopy but with overflow checks. 
+// Pre stack: kexit_info, dest_offset, offset, size +// Post stack: (empty) global sys_returndatacopy: + DUP4 DUP4 %add_or_fault // Overflow check + %mload_context_metadata(@CTX_METADATA_RETURNDATA_SIZE) LT %jumpi(fault_exception) // Data len check + + %wcopy(@SEGMENT_RETURNDATA, @CTX_METADATA_RETURNDATA_SIZE) + +// Pre stack: kexit_info, dest_offset, offset, size +// Post stack: (empty) +global sys_codecopy: // stack: kexit_info, dest_offset, offset, size - PUSH @GAS_VERYLOW - // stack: Gverylow, kexit_info, dest_offset, offset, size - DUP5 %num_bytes_to_num_words %mul_const(@GAS_COPY) ADD %charge_gas + %wcopy_charge_gas %stack (kexit_info, dest_offset, offset, size) -> (dest_offset, size, kexit_info, dest_offset, offset, size) %add_or_fault // stack: expanded_num_bytes, kexit_info, dest_offset, offset, size, kexit_info DUP1 %ensure_reasonable_offset %update_mem_bytes - // stack: kexit_info, dest_offset, offset, size, kexit_info - DUP4 DUP4 %add_or_fault // Overflow check - %mload_context_metadata(@CTX_METADATA_RETURNDATA_SIZE) LT %jumpi(fault_exception) // Data len check - // stack: kexit_info, dest_offset, offset, size - DUP4 - // stack: size, kexit_info, dest_offset, offset, size - ISZERO %jumpi(returndatacopy_empty) + GET_CONTEXT + %mload_context_metadata(@CTX_METADATA_CODE_SIZE) + // stack: code_size, ctx, kexit_info, dest_offset, offset, size + %codecopy_after_checks(@SEGMENT_CODE) + + +// Pre stack: kexit_info, address, dest_offset, offset, size +// Post stack: (empty) +global sys_extcodecopy: + %stack (kexit_info, address, dest_offset, offset, size) + -> (address, dest_offset, offset, size, kexit_info) + %u256_to_addr DUP1 %insert_accessed_addresses + // stack: cold_access, address, dest_offset, offset, size, kexit_info + PUSH @GAS_COLDACCOUNTACCESS_MINUS_WARMACCESS + MUL + PUSH @GAS_WARMACCESS + ADD + // stack: Gaccess, address, dest_offset, offset, size, kexit_info - %mload_context_metadata(@CTX_METADATA_RETURNDATA_SIZE) - // stack: total_size, 
kexit_info, dest_offset, offset, size - DUP4 - // stack: offset, total_size, kexit_info, dest_offset, offset, size - GT %jumpi(wcopy_large_offset) + DUP5 + // stack: size, Gaccess, address, dest_offset, offset, size, kexit_info + ISZERO %jumpi(sys_extcodecopy_empty) - PUSH @SEGMENT_RETURNDATA - %mload_context_metadata(@CTX_METADATA_RETURNDATA_SIZE) - // stack: total_size, returndata_segment, kexit_info, dest_offset, offset, size - DUP6 DUP6 ADD - // stack: offset + size, total_size, returndata_segment, kexit_info, dest_offset, offset, size - LT %jumpi(wcopy_within_bounds) + // stack: Gaccess, address, dest_offset, offset, size, kexit_info + DUP5 %num_bytes_to_num_words %mul_const(@GAS_COPY) ADD + %stack (gas, address, dest_offset, offset, size, kexit_info) -> (gas, kexit_info, address, dest_offset, offset, size) + %charge_gas - %mload_context_metadata(@CTX_METADATA_RETURNDATA_SIZE) - // stack: total_size, returndata_segment, kexit_info, dest_offset, offset, size - DUP6 DUP6 ADD - // stack: offset + size, total_size, returndata_segment, kexit_info, dest_offset, offset, size - SUB // extra_size = offset + size - total_size - // stack: extra_size, returndata_segment, kexit_info, dest_offset, offset, size - DUP1 DUP7 SUB - // stack: copy_size = size - extra_size, extra_size, returndata_segment, kexit_info, dest_offset, offset, size + %stack (kexit_info, address, dest_offset, offset, size) -> (dest_offset, size, kexit_info, address, dest_offset, offset, size) + %add_or_fault + // stack: expanded_num_bytes, kexit_info, address, dest_offset, offset, size + DUP1 %ensure_reasonable_offset + %update_mem_bytes - // Compute the new dest_offset after actual copies, at which we will start padding with zeroes. 
- DUP1 DUP6 ADD - // stack: new_dest_offset, copy_size, extra_size, returndata_segment, kexit_info, dest_offset, offset, size + %next_context_id - GET_CONTEXT - %stack (context, new_dest_offset, copy_size, extra_size, returndata_segment, kexit_info, dest_offset, offset, size) -> - (context, @SEGMENT_MAIN_MEMORY, dest_offset, context, returndata_segment, offset, copy_size, wcopy_over_range, new_dest_offset, extra_size, kexit_info) - %jump(memcpy_bytes) + %stack (ctx, kexit_info, address, dest_offset, offset, size) -> + (address, ctx, extcodecopy_contd, ctx, kexit_info, dest_offset, offset, size) + %jump(load_code) -returndatacopy_empty: - %stack (kexit_info, dest_offset, offset, size) -> (kexit_info) +sys_extcodecopy_empty: + %stack (Gaccess, address, dest_offset, offset, size, kexit_info) -> (Gaccess, kexit_info) + %charge_gas EXIT_KERNEL +extcodecopy_contd: + // stack: code_size, ctx, kexit_info, dest_offset, offset, size + %codecopy_after_checks(@SEGMENT_CODE) + // Same as %wcopy but with special handling in case of overlapping ranges. 
global sys_mcopy: // stack: kexit_info, dest_offset, offset, size - PUSH @GAS_VERYLOW - // stack: Gverylow, kexit_info, dest_offset, offset, size - DUP5 %num_bytes_to_num_words %mul_const(@GAS_COPY) ADD %charge_gas - - // stack: kexit_info, dest_offset, offset, size - DUP4 - // stack: size, kexit_info, dest_offset, offset, size - ISZERO %jumpi(returndatacopy_empty) // If size is empty, just pop the stack and exit the kernel + %wcopy_charge_gas %stack (kexit_info, dest_offset, offset, size) -> (dest_offset, size, kexit_info, dest_offset, offset, size) %add_or_fault @@ -235,45 +234,89 @@ global sys_mcopy: // stack: kexit_info, dest_offset, offset, size DUP3 DUP3 EQ // stack: dest_offset = offset, kexit_info, dest_offset, offset, size - %jumpi(returndatacopy_empty) // If SRC == DST, just pop the stack and exit the kernel + %jumpi(mcopy_empty) // If SRC == DST, just pop the stack and exit the kernel // stack: kexit_info, dest_offset, offset, size + GET_CONTEXT PUSH @SEGMENT_MAIN_MEMORY - DUP5 DUP5 ADD - // stack: offset + size, segment, kexit_info, dest_offset, offset, size - DUP4 LT - // stack: dest_offset < offset + size, segment, kexit_info, dest_offset, offset, size - DUP5 DUP5 GT - // stack: dest_offset > offset, dest_offset < offset + size, segment, kexit_info, dest_offset, offset, size - AND - // stack: (dest_offset > offset) && (dest_offset < offset + size), segment, kexit_info, dest_offset, offset, size + DUP6 DUP6 ADD + // stack: offset + size, segment, context, kexit_info, dest_offset, offset, size + DUP5 LT + // stack: dest_offset < offset + size, segment, context, kexit_info, dest_offset, offset, size + DUP6 DUP6 GT + // stack: dest_offset > offset, dest_offset < offset + size, segment, context, kexit_info, dest_offset, offset, size + MUL // AND + // stack: (dest_offset > offset) && (dest_offset < offset + size), segment, context, kexit_info, dest_offset, offset, size // If both conditions are satisfied, that means we will get an overlap, in which case we 
need to process the copy // in two chunks to prevent overwriting memory data before reading it. %jumpi(mcopy_with_overlap) - // stack: segment, kexit_info, dest_offset, offset, size - PUSH wcopy_within_bounds - JUMP + // stack: segment, context, kexit_info, dest_offset, offset, size + %jump(wcopy_within_bounds) mcopy_with_overlap: // We do have an overlap between the SRC and DST ranges. We will first copy the overlapping segment // (i.e. end of the copy portion), then copy the remaining (i.e. beginning) portion. - // stack: segment, kexit_info, dest_offset, offset, size - DUP4 DUP4 SUB - // stack: remaining_size = dest_offset - offset, segment, kexit_info, dest_offset, offset, size - DUP1 DUP7 + // stack: segment, context, kexit_info, dest_offset, offset, size + DUP5 DUP5 SUB + // stack: remaining_size = dest_offset - offset, segment, context, kexit_info, dest_offset, offset, size + DUP1 DUP8 SUB // overlapping_size = size - remaining_size - // stack: overlapping_size, remaining_size, segment, kexit_info, dest_offset, offset, size + // stack: overlapping_size, remaining_size, segment, context, kexit_info, dest_offset, offset, size // Shift the initial offsets to copy the overlapping segment first. 
- DUP2 DUP7 ADD - // stack: offset_first_copy, overlapping_size, remaining_size, segment, kexit_info, dest_offset, offset, size - DUP3 DUP7 ADD - // stack: dest_offset_first_copy, offset_first_copy, overlapping_size, remaining_size, segment, kexit_info, dest_offset, offset, size + DUP2 DUP8 ADD + // stack: offset_first_copy, overlapping_size, remaining_size, segment, context, kexit_info, dest_offset, offset, size + DUP3 DUP8 ADD + // stack: dest_offset_first_copy, offset_first_copy, overlapping_size, remaining_size, segment, context, kexit_info, dest_offset, offset, size + + %stack (dest_offset_first_copy, offset_first_copy, overlapping_size, remaining_size, segment, context, kexit_info, dest_offset, offset, size) -> + (context, segment, offset_first_copy, segment, dest_offset_first_copy, context, overlapping_size, wcopy_within_bounds, segment, context, kexit_info, dest_offset, offset, remaining_size) + %build_address // SRC + SWAP3 + %build_address // DST + // stack: DST, SRC, overlapping_size, wcopy_within_bounds, segment, context, kexit_info, dest_offset, offset, remaining_size + %jump(memcpy_bytes) + +mcopy_empty: + // stack: kexit_info, dest_offset, offset, size + %stack (kexit_info, dest_offset, offset, size) -> (kexit_info) + EXIT_KERNEL + + +// The internal logic is similar to wcopy, but handles range overflow differently. +// It is used for both CODECOPY and EXTCODECOPY. 
+%macro codecopy_after_checks(segment) + // stack: total_size, src_ctx, kexit_info, dest_offset, offset, size + DUP1 DUP6 + // stack: offset, total_size, total_size, src_ctx, kexit_info, dest_offset, offset, size + GT %jumpi(codecopy_large_offset) + + PUSH $segment SWAP1 + // stack: total_size, segment, src_ctx, kexit_info, dest_offset, offset, size + DUP1 DUP8 DUP8 ADD + // stack: offset + size, total_size, total_size, segment, src_ctx, kexit_info, dest_offset, offset, size + LT %jumpi(codecopy_within_bounds) + + // stack: total_size, segment, src_ctx, kexit_info, dest_offset, offset, size + DUP7 DUP7 ADD + // stack: offset + size, total_size, segment, src_ctx, kexit_info, dest_offset, offset, size + SUB // extra_size = offset + size - total_size + // stack: extra_size, segment, src_ctx, kexit_info, dest_offset, offset, size + DUP1 DUP8 SUB + // stack: copy_size = size - extra_size, extra_size, segment, src_ctx, kexit_info, dest_offset, offset, size + + // Compute the new dest_offset after actual copies, at which we will start padding with zeroes. 
+ DUP1 DUP7 ADD + // stack: new_dest_offset, copy_size, extra_size, segment, src_ctx, kexit_info, dest_offset, offset, size GET_CONTEXT - %stack (context, dest_offset_first_copy, offset_first_copy, overlapping_size, remaining_size, segment, kexit_info, dest_offset, offset, size) -> - (context, segment, dest_offset_first_copy, context, segment, offset_first_copy, overlapping_size, wcopy_within_bounds, segment, kexit_info, dest_offset, offset, remaining_size) + %stack (context, new_dest_offset, copy_size, extra_size, segment, src_ctx, kexit_info, dest_offset, offset, size) -> + (src_ctx, segment, offset, @SEGMENT_MAIN_MEMORY, dest_offset, context, copy_size, wcopy_large_offset, kexit_info, new_dest_offset, offset, extra_size) + %build_address + SWAP3 %build_address + // stack: DST, SRC, copy_size, wcopy_large_offset, kexit_info, new_dest_offset, offset, extra_size %jump(memcpy_bytes) +%endmacro diff --git a/evm/src/cpu/kernel/asm/memory/txn_fields.asm b/evm/src/cpu/kernel/asm/memory/txn_fields.asm index e4e6b87544..a8c1c0788f 100644 --- a/evm/src/cpu/kernel/asm/memory/txn_fields.asm +++ b/evm/src/cpu/kernel/asm/memory/txn_fields.asm @@ -1,18 +1,27 @@ // Load the given normalized transaction field from memory. %macro mload_txn_field(field) + // Transaction fields are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. + // stack: (empty) PUSH $field - // stack: offset - %mload_kernel(@SEGMENT_NORMALIZED_TXN) + // stack: addr + MLOAD_GENERAL // stack: value %endmacro // Store the given normalized transaction field to memory. %macro mstore_txn_field(field) + // Transaction fields are already scaled by their corresponding segment, + // effectively making them the direct memory position to read from / + // write to. 
+ // stack: value PUSH $field - // stack: offset, value - %mstore_kernel(@SEGMENT_NORMALIZED_TXN) + // stack: addr, value + SWAP1 + MSTORE_GENERAL // stack: (empty) %endmacro diff --git a/evm/src/cpu/kernel/asm/mpt/delete/delete_branch.asm b/evm/src/cpu/kernel/asm/mpt/delete/delete_branch.asm index 775e4e11ed..64187ac83a 100644 --- a/evm/src/cpu/kernel/asm/mpt/delete/delete_branch.asm +++ b/evm/src/cpu/kernel/asm/mpt/delete/delete_branch.asm @@ -43,7 +43,10 @@ update_branch: // If it's one, transform the branch node into an leaf/extension node and return it. maybe_normalize_branch: // stack: updated_child_ptr, first_nibble, node_payload_ptr, retdest - PUSH 0 %mstore_kernel_general(0) PUSH 0 %mstore_kernel_general(1) + PUSH 0 + PUSH @SEGMENT_KERNEL_GENERAL + MSTORE_32BYTES_2 + POP // stack: updated_child_ptr, first_nibble, node_payload_ptr, retdest PUSH 0 // Loop from i=0..16 excluding `first_nibble` and store the number of non-empty children in @@ -61,16 +64,18 @@ loop_eq_first_nibble: %increment %jump(loop) loop_non_empty: // stack: i, updated_child_ptr, first_nibble, node_payload_ptr, retdest - %mload_kernel_general(0) %increment %mstore_kernel_general(0) - DUP1 %mstore_kernel_general(1) + %mload_kernel_no_offset(@SEGMENT_KERNEL_GENERAL) %increment %mstore_kernel_no_offset(@SEGMENT_KERNEL_GENERAL) + PUSH 1 PUSH @SEGMENT_KERNEL_GENERAL %build_kernel_address + DUP2 + MSTORE_GENERAL %increment %jump(loop) loop_end: // stack: i, updated_child_ptr, first_nibble, node_payload_ptr, retdest POP // stack: updated_child_ptr, first_nibble, node_payload_ptr, retdest // If there's more than one non-empty child, simply update the branch node. - %mload_kernel_general(0) %gt_const(1) %jumpi(update_branch) - %mload_kernel_general(0) ISZERO %jumpi(panic) // This should never happen. + %mload_kernel_no_offset(@SEGMENT_KERNEL_GENERAL) %gt_const(1) %jumpi(update_branch) + %mload_kernel_no_offset(@SEGMENT_KERNEL_GENERAL) ISZERO %jumpi(panic) // This should never happen. 
// Otherwise, transform the branch node into a leaf/extension node. // stack: updated_child_ptr, first_nibble, node_payload_ptr, retdest %mload_kernel_general(1) diff --git a/evm/src/cpu/kernel/asm/mpt/delete/delete_extension.asm b/evm/src/cpu/kernel/asm/mpt/delete/delete_extension.asm index 149b971d76..0627fcba6a 100644 --- a/evm/src/cpu/kernel/asm/mpt/delete/delete_extension.asm +++ b/evm/src/cpu/kernel/asm/mpt/delete/delete_extension.asm @@ -37,18 +37,10 @@ after_mpt_delete_extension_branch: // stack: child_type, updated_child_node_ptr, node_payload_ptr, node_len, node_key, retdest POP // stack: updated_child_node_ptr, node_payload_ptr, node_len, node_key, retdest - SWAP1 - // stack: extension_ptr, updated_child_node_ptr, node_len, node_key, retdest - PUSH @MPT_NODE_EXTENSION DUP2 %mstore_trie_data - // stack: extension_ptr, updated_child_node_ptr, node_len, node_key, retdest - DUP3 DUP2 %mstore_trie_data // Append node_len to our node - // stack: extension_ptr, updated_child_node_ptr, node_len, node_key, retdest - DUP4 DUP2 %mstore_trie_data // Append node_key to our node - // stack: extension_ptr, updated_child_node_ptr, node_len, node_key, retdest - SWAP1 DUP2 %mstore_trie_data // Append updated_child_node_ptr to our node - // stack: extension_ptr, node_len, node_key, retdest + DUP2 %add_const(2) %mstore_trie_data + // stack: node_payload_ptr, node_len, node_key, retdest + %decrement %stack (extension_ptr, node_len, node_key, retdest) -> (retdest, extension_ptr) - // stack: extension_ptr, retdest JUMP after_mpt_delete_extension_extension: diff --git a/evm/src/cpu/kernel/asm/mpt/hash/hash.asm b/evm/src/cpu/kernel/asm/mpt/hash/hash.asm index 4209f06c75..9acde9ce78 100644 --- a/evm/src/cpu/kernel/asm/mpt/hash/hash.asm +++ b/evm/src/cpu/kernel/asm/mpt/hash/hash.asm @@ -1,41 +1,44 @@ // Computes the Merkle root of the given trie node. 
// // encode_value is a function which should take as input -// - the position withing @SEGMENT_RLP_RAW to write to, -// - the offset of a value within @SEGMENT_TRIE_DATA, and -// - a return address. +// - the position within @SEGMENT_RLP_RAW to write to, +// - the offset of a value within @SEGMENT_TRIE_DATA, +// - a return address, and +// - the current length of @SEGMENT_TRIE_DATA // It should serialize the value, write it to @SEGMENT_RLP_RAW starting at the -// given position, and return an updated position (the next unused offset). +// given position, and return an updated position (the next unused offset) as well +// as an updated length for @SEGMENT_TRIE_DATA. // -// Pre stack: node_ptr, encode_value, retdest -// Post stack: hash +// Given the initial length of the `TrieData` segment, it also updates the length +// for the current trie. +// +// Pre stack: node_ptr, encode_value, cur_len, retdest +// Post stack: hash, new_len global mpt_hash: - // stack: node_ptr, encode_value, retdest - %stack (node_ptr, encode_value) -> (node_ptr, encode_value, mpt_hash_hash_if_rlp) + // stack: node_ptr, encode_value, cur_len, retdest + %stack (node_ptr, encode_value, cur_len) -> (node_ptr, encode_value, cur_len, mpt_hash_hash_if_rlp) %jump(encode_or_hash_node) mpt_hash_hash_if_rlp: - // stack: result, result_len, retdest + // stack: result, result_len, new_len, retdest // If result_len < 32, then we have an RLP blob, and we need to hash it. DUP2 %lt_const(32) %jumpi(mpt_hash_hash_rlp) // Otherwise, we already have a hash, so just return it. 
- // stack: result, result_len, retdest - %stack (result, result_len, retdest) -> (retdest, result) + // stack: result, result_len, new_len, retdest + %stack (result, result_len, new_len, retdest) -> (retdest, result, new_len) JUMP mpt_hash_hash_rlp: - // stack: result, result_len, retdest - %stack (result, result_len) - // context, segment, offset, value, len, retdest - -> (0, @SEGMENT_RLP_RAW, 0, result, result_len, mpt_hash_hash_rlp_after_unpacking) + // stack: result, result_len, new_len, retdest + %stack (result, result_len, new_len) + -> (@SEGMENT_RLP_RAW, result, result_len, mpt_hash_hash_rlp_after_unpacking, result_len, new_len) + // stack: addr, result, result_len, mpt_hash_hash_rlp_after_unpacking, result_len, new_len %jump(mstore_unpacking) mpt_hash_hash_rlp_after_unpacking: - // stack: result_len, retdest - PUSH 0 // offset - PUSH @SEGMENT_RLP_RAW // segment - PUSH 0 // context - // stack: result_addr: 3, result_len, retdest + // stack: result_addr, result_len, new_len, retdest + POP PUSH @SEGMENT_RLP_RAW // ctx == virt == 0 + // stack: result_addr, result_len, new_len, retdest KECCAK_GENERAL - // stack: hash, retdest - SWAP1 + // stack: hash, new_len, retdest + %stack(hash, new_len, retdest) -> (retdest, hash, new_len) JUMP // Given a trie node, return its RLP encoding if it is is less than 32 bytes, @@ -44,14 +47,13 @@ mpt_hash_hash_rlp_after_unpacking: // The result is given as a (value, length) pair, where the length is given // in bytes. // -// Pre stack: node_ptr, encode_value, retdest -// Post stack: result, result_len +// Pre stack: node_ptr, encode_value, cur_len, retdest +// Post stack: result, result_len, cur_len global encode_or_hash_node: - // stack: node_ptr, encode_value, retdest DUP1 %mload_trie_data // Check if we're dealing with a concrete node, i.e. not a hash node. 
- // stack: node_type, node_ptr, encode_value, retdest + // stack: node_type, node_ptr, encode_value, cur_len, retdest DUP1 PUSH @MPT_NODE_HASH SUB @@ -59,51 +61,51 @@ global encode_or_hash_node: // If we got here, node_type == @MPT_NODE_HASH. // Load the hash and return (hash, 32). - // stack: node_type, node_ptr, encode_value, retdest + // stack: node_type, node_ptr, encode_value, cur_len, retdest POP - // stack: node_ptr, encode_value, retdest + + // stack: node_ptr, encode_value, cur_len, retdest %increment // Skip over node type prefix - // stack: hash_ptr, encode_value, retdest + // stack: hash_ptr, encode_value, cur_len, retdest %mload_trie_data - // stack: hash, encode_value, retdest - %stack (hash, encode_value, retdest) -> (retdest, hash, 32) + // stack: hash, encode_value, cur_len, retdest + // Update the length of the `TrieData` segment: there are only two + // elements in a hash node. + SWAP2 %add_const(2) + %stack (cur_len, encode_value, hash, retdest) -> (retdest, hash, 32, cur_len) JUMP encode_or_hash_concrete_node: - %stack (node_type, node_ptr, encode_value) -> (node_type, node_ptr, encode_value, maybe_hash_node) + %stack (node_type, node_ptr, encode_value, cur_len) -> (node_type, node_ptr, encode_value, cur_len, maybe_hash_node) %jump(encode_node) maybe_hash_node: - // stack: result_ptr, result_len, retdest + // stack: result_addr, result_len, cur_len, retdest DUP2 %lt_const(32) %jumpi(pack_small_rlp) // result_len >= 32, so we hash the result. 
- // stack: result_ptr, result_len, retdest - PUSH @SEGMENT_RLP_RAW // segment - PUSH 0 // context - // stack: result_addr: 3, result_len, retdest + // stack: result_addr, result_len, cur_len, retdest KECCAK_GENERAL - %stack (hash, retdest) -> (retdest, hash, 32) + %stack (hash, cur_len, retdest) -> (retdest, hash, 32, cur_len) JUMP pack_small_rlp: - // stack: result_ptr, result_len, retdest - %stack (result_ptr, result_len) - -> (0, @SEGMENT_RLP_RAW, result_ptr, result_len, - after_packed_small_rlp, result_len) - %jump(mload_packing) + // stack: result_ptr, result_len, cur_len, retdest + %stack (result_ptr, result_len, cur_len) + -> (result_ptr, result_len, result_len, cur_len) + MLOAD_32BYTES after_packed_small_rlp: - %stack (result, result_len, retdest) -> (retdest, result, result_len) + %stack (result, result_len, cur_len, retdest) -> (retdest, result, result_len, cur_len) JUMP // RLP encode the given trie node, and return an (pointer, length) pair // indicating where the data lives within @SEGMENT_RLP_RAW. // -// Pre stack: node_type, node_ptr, encode_value, retdest -// Post stack: result_ptr, result_len +// Pre stack: node_type, node_ptr, encode_value, cur_len, retdest +// Post stack: result_ptr, result_len, cur_len encode_node: - // stack: node_type, node_ptr, encode_value, retdest + // stack: node_type, node_ptr, encode_value, cur_len, retdest // Increment node_ptr, so it points to the node payload instead of its type. 
SWAP1 %increment SWAP1 - // stack: node_type, node_payload_ptr, encode_value, retdest + // stack: node_type, node_payload_ptr, encode_value, cur_len, retdest DUP1 %eq_const(@MPT_NODE_EMPTY) %jumpi(encode_node_empty) DUP1 %eq_const(@MPT_NODE_BRANCH) %jumpi(encode_node_branch) @@ -115,193 +117,172 @@ encode_node: PANIC global encode_node_empty: - // stack: node_type, node_payload_ptr, encode_value, retdest + // stack: node_type, node_payload_ptr, encode_value, cur_len, retdest %pop3 - // stack: retdest - // An empty node is encoded as a single byte, 0x80, which is the RLP encoding of the empty string. - // TODO: Write this byte just once to RLP memory, then we can always return (0, 1). - %alloc_rlp_block - // stack: rlp_pos, retdest - PUSH 0x80 - // stack: 0x80, rlp_pos, retdest - DUP2 - // stack: rlp_pos, 0x80, rlp_pos, retdest - %mstore_rlp - %stack (rlp_pos, retdest) -> (retdest, rlp_pos, 1) + %stack (cur_len, retdest) -> (retdest, @ENCODED_EMPTY_NODE_POS, 1, cur_len) JUMP global encode_node_branch: - // stack: node_type, node_payload_ptr, encode_value, retdest + // stack: node_type, node_payload_ptr, encode_value, cur_len, retdest POP - // stack: node_payload_ptr, encode_value, retdest - // Get the next unused offset within the encoded child buffers. - // Then immediately increment the next unused offset by 16, so any - // recursive calls will use nonoverlapping offsets. - // TODO: Allocate a block of RLP memory instead? - %mload_global_metadata(@GLOBAL_METADATA_TRIE_ENCODED_CHILD_SIZE) - DUP1 %add_const(16) - %mstore_global_metadata(@GLOBAL_METADATA_TRIE_ENCODED_CHILD_SIZE) - // stack: base_offset, node_payload_ptr, encode_value, retdest + // `TrieData` stores the node type, 16 children pointers, and a value pointer. 
+ SWAP2 %add_const(18) SWAP2 + // stack: node_payload_ptr, encode_value, cur_len, retdest + + // Allocate a block of RLP memory + %alloc_rlp_block DUP1 + // stack: rlp_pos, rlp_start, node_payload_ptr, encode_value, cur_len retdest - // We will call encode_or_hash_node on each child. For the i'th child, we - // will store the result in SEGMENT_TRIE_ENCODED_CHILD[base + i], and its length in - // SEGMENT_TRIE_ENCODED_CHILD_LEN[base + i]. + // Call encode_or_hash_node on each child %encode_child(0) %encode_child(1) %encode_child(2) %encode_child(3) %encode_child(4) %encode_child(5) %encode_child(6) %encode_child(7) %encode_child(8) %encode_child(9) %encode_child(10) %encode_child(11) %encode_child(12) %encode_child(13) %encode_child(14) %encode_child(15) - // stack: base_offset, node_payload_ptr, encode_value, retdest - // Now, append each child to our RLP tape. - %alloc_rlp_block DUP1 - // stack: rlp_pos, rlp_start, base_offset, node_payload_ptr, encode_value, retdest - %append_child(0) %append_child(1) %append_child(2) %append_child(3) - %append_child(4) %append_child(5) %append_child(6) %append_child(7) - %append_child(8) %append_child(9) %append_child(10) %append_child(11) - %append_child(12) %append_child(13) %append_child(14) %append_child(15) - // stack: rlp_pos', rlp_start, base_offset, node_payload_ptr, encode_value, retdest + // stack: rlp_pos', rlp_start, node_payload_ptr, encode_value, cur_len, retdest - %stack (rlp_pos, rlp_start, base_offset, node_payload_ptr) + %stack (rlp_pos, rlp_start, node_payload_ptr) -> (node_payload_ptr, rlp_pos, rlp_start) %add_const(16) - // stack: value_ptr_ptr, rlp_pos', rlp_start, encode_value, retdest + // stack: value_ptr_ptr, rlp_pos', rlp_start, encode_value, cur_len, retdest %mload_trie_data - // stack: value_ptr, rlp_pos', rlp_start, encode_value, retdest + // stack: value_ptr, rlp_pos', rlp_start, encode_value, cur_len, retdest DUP1 %jumpi(encode_node_branch_with_value) // No value; append the empty string (0x80). 
- // stack: value_ptr, rlp_pos', rlp_start, encode_value, retdest - %stack (value_ptr, rlp_pos, rlp_start, encode_value) -> (rlp_pos, 0x80, rlp_pos, rlp_start) - %mstore_rlp - // stack: rlp_pos', rlp_start, retdest + // stack: value_ptr, rlp_pos', rlp_start, encode_value, cur_len, retdest + %stack (value_ptr, rlp_pos, rlp_start, encode_value) -> (0x80, rlp_pos, rlp_pos, rlp_start) + MSTORE_GENERAL + // stack: rlp_pos', rlp_start, cur_len, retdest %increment - // stack: rlp_pos'', rlp_start, retdest + // stack: rlp_pos'', rlp_start, cur_len, retdest %jump(encode_node_branch_prepend_prefix) encode_node_branch_with_value: - // stack: value_ptr, rlp_pos', rlp_start, encode_value, retdest - %stack (value_ptr, rlp_pos, rlp_start, encode_value) - -> (encode_value, rlp_pos, value_ptr, encode_node_branch_prepend_prefix, rlp_start) + // stack: value_ptr, rlp_pos', rlp_start, encode_value, cur_len, retdest + %stack (value_ptr, rlp_pos, rlp_start, encode_value, cur_len) + -> (encode_value, rlp_pos, value_ptr, cur_len, encode_node_branch_after_value, rlp_start) JUMP // call encode_value +encode_node_branch_after_value: + // stack: rlp_pos'', cur_len, rlp_start, retdest + %stack(rlp_pos, cur_len, rlp_start, retdest) -> (rlp_pos, rlp_start, cur_len, retdest) encode_node_branch_prepend_prefix: - // stack: rlp_pos'', rlp_start, retdest + // stack: rlp_pos'', rlp_start, cur_len, retdest %prepend_rlp_list_prefix - // stack: rlp_prefix_start, rlp_len, retdest - %stack (rlp_prefix_start, rlp_len, retdest) - -> (retdest, rlp_prefix_start, rlp_len) + // stack: rlp_prefix_start, rlp_len, cur_len, retdest + %stack (rlp_prefix_start, rlp_len, cur_len, retdest) + -> (retdest, rlp_prefix_start, rlp_len, cur_len) JUMP + // Part of the encode_node_branch function. Encodes the i'th child. -// Stores the result in SEGMENT_TRIE_ENCODED_CHILD[base + i], and its length in -// SEGMENT_TRIE_ENCODED_CHILD_LEN[base + i]. 
%macro encode_child(i) - // stack: base_offset, node_payload_ptr, encode_value, retdest + // stack: rlp_pos, rlp_start, node_payload_ptr, encode_value, cur_len, retdest PUSH %%after_encode - DUP4 DUP4 - // stack: node_payload_ptr, encode_value, %%after_encode, base_offset, node_payload_ptr, encode_value, retdest + DUP6 DUP6 DUP6 + // stack: node_payload_ptr, encode_value, cur_len, %%after_encode, rlp_pos, rlp_start, node_payload_ptr, encode_value, cur_len, retdest %add_const($i) %mload_trie_data - // stack: child_i_ptr, encode_value, %%after_encode, base_offset, node_payload_ptr, encode_value, retdest + // stack: child_i_ptr, encode_value, cur_len, %%after_encode, rlp_pos, rlp_start, node_payload_ptr, encode_value, cur_len, retdest %jump(encode_or_hash_node) %%after_encode: - // stack: result, result_len, base_offset, node_payload_ptr, encode_value, retdest - DUP3 %add_const($i) %mstore_kernel(@SEGMENT_TRIE_ENCODED_CHILD) - // stack: result_len, base_offset, node_payload_ptr, encode_value, retdest - DUP2 %add_const($i) %mstore_kernel(@SEGMENT_TRIE_ENCODED_CHILD_LEN) - // stack: base_offset, node_payload_ptr, encode_value, retdest -%endmacro - -// Part of the encode_node_branch function. Appends the i'th child's RLP. -%macro append_child(i) - // stack: rlp_pos, rlp_start, base_offset, node_payload_ptr, encode_value, retdest - DUP3 %add_const($i) %mload_kernel(@SEGMENT_TRIE_ENCODED_CHILD) // load result - DUP4 %add_const($i) %mload_kernel(@SEGMENT_TRIE_ENCODED_CHILD_LEN) // load result_len - // stack: result_len, result, rlp_pos, rlp_start, base_offset, node_payload_ptr, encode_value, retdest + // stack: result, result_len, cur_len, rlp_pos, rlp_start, node_payload_ptr, encode_value, old_len, retdest // If result_len != 32, result is raw RLP, with an appropriate RLP prefix already. - DUP1 %sub_const(32) %jumpi(%%unpack) + SWAP1 + PUSH 32 DUP2 SUB + %jumpi(%%unpack) // Otherwise, result is a hash, and we need to add the prefix 0x80 + 32 = 160. 
- // stack: result_len, result, rlp_pos, rlp_start, base_offset, node_payload_ptr, encode_value, retdest - PUSH 160 + // stack: result_len, result, cur_len, rlp_pos, rlp_start, node_payload_ptr, encode_value, old_len, retdest DUP4 // rlp_pos - %mstore_rlp - SWAP2 %increment SWAP2 // rlp_pos += 1 + PUSH 160 + MSTORE_GENERAL + SWAP3 %increment SWAP3 // rlp_pos += 1 %%unpack: - %stack (result_len, result, rlp_pos, rlp_start, base_offset, node_payload_ptr, encode_value, retdest) + %stack (result_len, result, cur_len, rlp_pos, rlp_start, node_payload_ptr, encode_value, old_len, retdest) -> (rlp_pos, result, result_len, %%after_unpacking, - rlp_start, base_offset, node_payload_ptr, encode_value, retdest) - %jump(mstore_unpacking_rlp) + rlp_start, node_payload_ptr, encode_value, cur_len, retdest) + %jump(mstore_unpacking) %%after_unpacking: - // stack: rlp_pos', rlp_start, base_offset, node_payload_ptr, encode_value, retdest + // stack: rlp_pos', rlp_start, node_payload_ptr, encode_value, cur_len, retdest %endmacro global encode_node_extension: - // stack: node_type, node_payload_ptr, encode_value, retdest - %stack (node_type, node_payload_ptr, encode_value) - -> (node_payload_ptr, encode_value, encode_node_extension_after_encode_child, node_payload_ptr) + // stack: node_type, node_payload_ptr, encode_value, cur_len, retdest + SWAP3 %add_const(4) SWAP3 + %stack (node_type, node_payload_ptr, encode_value, cur_len) + -> (node_payload_ptr, encode_value, cur_len, encode_node_extension_after_encode_child, node_payload_ptr) %add_const(2) %mload_trie_data - // stack: child_ptr, encode_value, encode_node_extension_after_encode_child, node_payload_ptr, retdest + // stack: child_ptr, encode_value, cur_len, encode_node_extension_after_encode_child, node_payload_ptr, retdest %jump(encode_or_hash_node) encode_node_extension_after_encode_child: - // stack: result, result_len, node_payload_ptr, retdest + // stack: result, result_len, cur_len, node_payload_ptr, retdest + %stack (result, 
result_len, cur_len, node_payload_ptr) -> (result, result_len, node_payload_ptr, cur_len) %alloc_rlp_block - // stack: rlp_start, result, result_len, node_payload_ptr, retdest + // stack: rlp_start, result, result_len, node_payload_ptr, cur_len, retdest PUSH encode_node_extension_after_hex_prefix // retdest PUSH 0 // terminated - // stack: terminated, encode_node_extension_after_hex_prefix, rlp_start, result, result_len, node_payload_ptr, retdest + // stack: terminated, encode_node_extension_after_hex_prefix, rlp_start, result, result_len, node_payload_ptr, cur_len, retdest DUP6 %increment %mload_trie_data // Load the packed_nibbles field, which is at index 1. - // stack: packed_nibbles, terminated, encode_node_extension_after_hex_prefix, rlp_start, result, result_len, node_payload_ptr, retdest + // stack: packed_nibbles, terminated, encode_node_extension_after_hex_prefix, rlp_start, result, result_len, node_payload_ptr, cur_len, retdest DUP7 %mload_trie_data // Load the num_nibbles field, which is at index 0. - // stack: num_nibbles, packed_nibbles, terminated, encode_node_extension_after_hex_prefix, rlp_start, result, result_len, node_payload_ptr, retdest + // stack: num_nibbles, packed_nibbles, terminated, encode_node_extension_after_hex_prefix, rlp_start, result, result_len, node_payload_ptr, cur_len, retdest DUP5 - // stack: rlp_start, num_nibbles, packed_nibbles, terminated, encode_node_extension_after_hex_prefix, rlp_start, result, result_len, node_payload_ptr, retdest + // stack: rlp_start, num_nibbles, packed_nibbles, terminated, encode_node_extension_after_hex_prefix, rlp_start, result, result_len, node_payload_ptr, cur_len, retdest %jump(hex_prefix_rlp) encode_node_extension_after_hex_prefix: - // stack: rlp_pos, rlp_start, result, result_len, node_payload_ptr, retdest + // stack: rlp_pos, rlp_start, result, result_len, node_payload_ptr, cur_len, retdest // If result_len != 32, result is raw RLP, with an appropriate RLP prefix already. 
- DUP4 %sub_const(32) %jumpi(encode_node_extension_unpack) + PUSH 32 DUP5 SUB + %jumpi(encode_node_extension_unpack) // Otherwise, result is a hash, and we need to add the prefix 0x80 + 32 = 160. + DUP1 // rlp_pos PUSH 160 - DUP2 // rlp_pos - %mstore_rlp + MSTORE_GENERAL %increment // rlp_pos += 1 encode_node_extension_unpack: - %stack (rlp_pos, rlp_start, result, result_len, node_payload_ptr) - -> (rlp_pos, result, result_len, encode_node_extension_after_unpacking, rlp_start) - %jump(mstore_unpacking_rlp) + %stack (rlp_pos, rlp_start, result, result_len, node_payload_ptr, cur_len) + -> (rlp_pos, result, result_len, encode_node_extension_after_unpacking, rlp_start, cur_len) + %jump(mstore_unpacking) encode_node_extension_after_unpacking: - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_pos, rlp_start, cur_len, retdest %prepend_rlp_list_prefix - %stack (rlp_prefix_start_pos, rlp_len, retdest) - -> (retdest, rlp_prefix_start_pos, rlp_len) + %stack (rlp_prefix_start_pos, rlp_len, cur_len, retdest) + -> (retdest, rlp_prefix_start_pos, rlp_len, cur_len) JUMP global encode_node_leaf: - // stack: node_type, node_payload_ptr, encode_value, retdest + // stack: node_type, node_payload_ptr, encode_value, cur_len, retdest POP - // stack: node_payload_ptr, encode_value, retdest + // stack: node_payload_ptr, encode_value, cur_len, retdest %alloc_rlp_block PUSH encode_node_leaf_after_hex_prefix // retdest PUSH 1 // terminated - // stack: terminated, encode_node_leaf_after_hex_prefix, rlp_start, node_payload_ptr, encode_value, retdest + // stack: terminated, encode_node_leaf_after_hex_prefix, rlp_start, node_payload_ptr, encode_value, cur_len, retdest DUP4 %increment %mload_trie_data // Load the packed_nibbles field, which is at index 1. 
- // stack: packed_nibbles, terminated, encode_node_leaf_after_hex_prefix, rlp_start, node_payload_ptr, encode_value, retdest + // stack: packed_nibbles, terminated, encode_node_leaf_after_hex_prefix, rlp_start, node_payload_ptr, encode_value, cur_len, retdest DUP5 %mload_trie_data // Load the num_nibbles field, which is at index 0. - // stack: num_nibbles, packed_nibbles, terminated, encode_node_leaf_after_hex_prefix, rlp_start, node_payload_ptr, encode_value, retdest + // stack: num_nibbles, packed_nibbles, terminated, encode_node_leaf_after_hex_prefix, rlp_start, node_payload_ptr, encode_value, cur_len, retdest DUP5 - // stack: rlp_start, num_nibbles, packed_nibbles, terminated, encode_node_leaf_after_hex_prefix, rlp_start, node_payload_ptr, encode_value, retdest + // stack: rlp_start, num_nibbles, packed_nibbles, terminated, encode_node_leaf_after_hex_prefix, rlp_start, node_payload_ptr, encode_value, cur_len, retdest %jump(hex_prefix_rlp) encode_node_leaf_after_hex_prefix: - // stack: rlp_pos, rlp_start, node_payload_ptr, encode_value, retdest + // stack: rlp_pos, rlp_start, node_payload_ptr, encode_value, cur_len, retdest SWAP2 %add_const(2) // The value pointer starts at index 3, after num_nibbles and packed_nibbles. 
- // stack: value_ptr_ptr, rlp_start, rlp_pos, encode_value, retdest + // stack: value_ptr_ptr, rlp_start, rlp_pos, encode_value, cur_len, retdest %mload_trie_data - // stack: value_ptr, rlp_start, rlp_pos, encode_value, retdest - %stack (value_ptr, rlp_start, rlp_pos, encode_value, retdest) - -> (encode_value, rlp_pos, value_ptr, encode_node_leaf_after_encode_value, rlp_start, retdest) + // stack: value_ptr, rlp_start, rlp_pos, encode_value, cur_len, retdest + %stack (value_ptr, rlp_start, rlp_pos, encode_value, cur_len, retdest) + -> (encode_value, rlp_pos, value_ptr, cur_len, encode_node_leaf_after_encode_value, rlp_start, retdest) JUMP encode_node_leaf_after_encode_value: - // stack: rlp_end_pos, rlp_start, retdest + // stack: rlp_end_pos, cur_len, rlp_start, retdest + // `TrieData` holds the node type, the number of nibbles, the nibbles, + // the pointer to the value and the value. + // We add 4 for the node type, the number of nibbles, the nibbles + // and the pointer to the value. + SWAP1 %add_const(4) + %stack(cur_len, rlp_end_pos, rlp_start, retdest) -> (rlp_end_pos, rlp_start, cur_len, retdest) %prepend_rlp_list_prefix - %stack (rlp_prefix_start_pos, rlp_len, retdest) - -> (retdest, rlp_prefix_start_pos, rlp_len) + %stack (rlp_prefix_start_pos, rlp_len, cur_len, retdest) + -> (retdest, rlp_prefix_start_pos, rlp_len, cur_len) JUMP diff --git a/evm/src/cpu/kernel/asm/mpt/hash/hash_trie_specific.asm b/evm/src/cpu/kernel/asm/mpt/hash/hash_trie_specific.asm index 767927fbc6..cd07c01fdc 100644 --- a/evm/src/cpu/kernel/asm/mpt/hash/hash_trie_specific.asm +++ b/evm/src/cpu/kernel/asm/mpt/hash/hash_trie_specific.asm @@ -1,299 +1,355 @@ // Hashing logic specific to a particular trie. 
global mpt_hash_state_trie: - // stack: retdest + // stack: cur_len, retdest PUSH encode_account %mload_global_metadata(@GLOBAL_METADATA_STATE_TRIE_ROOT) - // stack: node_ptr, encode_account, retdest + // stack: node_ptr, encode_account, cur_len, retdest %jump(mpt_hash) %macro mpt_hash_state_trie + // stack: cur_len PUSH %%after + SWAP1 %jump(mpt_hash_state_trie) %%after: %endmacro global mpt_hash_storage_trie: - // stack: node_ptr, retdest - %stack (node_ptr) -> (node_ptr, encode_storage_value) + // stack: node_ptr, cur_len, retdest + %stack (node_ptr, cur_len) -> (node_ptr, encode_storage_value, cur_len) %jump(mpt_hash) %macro mpt_hash_storage_trie - %stack (node_ptr) -> (node_ptr, %%after) + %stack (node_ptr, cur_len) -> (node_ptr, cur_len, %%after) %jump(mpt_hash_storage_trie) %%after: %endmacro global mpt_hash_txn_trie: - // stack: retdest + // stack: cur_len, retdest PUSH encode_txn %mload_global_metadata(@GLOBAL_METADATA_TXN_TRIE_ROOT) - // stack: node_ptr, encode_txn, retdest + // stack: node_ptr, encode_txn, cur_len, retdest %jump(mpt_hash) %macro mpt_hash_txn_trie + // stack: cur_len PUSH %%after + SWAP1 %jump(mpt_hash_txn_trie) %%after: %endmacro global mpt_hash_receipt_trie: - // stack: retdest + // stack: cur_len, retdest PUSH encode_receipt %mload_global_metadata(@GLOBAL_METADATA_RECEIPT_TRIE_ROOT) - // stack: node_ptr, encode_receipt, retdest + // stack: node_ptr, encode_receipt, cur_len, retdest %jump(mpt_hash) %macro mpt_hash_receipt_trie + // stack: cur_len PUSH %%after + SWAP1 %jump(mpt_hash_receipt_trie) %%after: %endmacro global encode_account: - // stack: rlp_pos, value_ptr, retdest + // stack: rlp_addr, value_ptr, cur_len, retdest // First, we compute the length of the RLP data we're about to write. + // We also update the length of the trie data segment. // The nonce and balance fields are variable-length, so we need to load them // to determine their contribution, while the other two fields are fixed // 32-bytes integers. 
+ + // First, we add 4 to the trie data length, for the nonce, + // the balance, the storage pointer and the code hash. + SWAP2 %add_const(4) SWAP2 + + // Now, we start the encoding. + // stack: rlp_addr, value_ptr, cur_len, retdest DUP2 %mload_trie_data // nonce = value[0] %rlp_scalar_len - // stack: nonce_rlp_len, rlp_pos, value_ptr, retdest + // stack: nonce_rlp_len, rlp_addr, value_ptr, cur_len, retdest DUP3 %increment %mload_trie_data // balance = value[1] %rlp_scalar_len - // stack: balance_rlp_len, nonce_rlp_len, rlp_pos, value_ptr, retdest + // stack: balance_rlp_len, nonce_rlp_len, rlp_addr, value_ptr, cur_len, retdest PUSH 66 // storage_root and code_hash fields each take 1 + 32 bytes ADD ADD - // stack: payload_len, rlp_pos, value_ptr, retdest + // stack: payload_len, rlp_addr, value_ptr, cur_len, retdest SWAP1 - // stack: rlp_pos, payload_len, value_ptr, retdest + // stack: rlp_addr, payload_len, value_ptr, cur_len, retdest DUP2 %rlp_list_len - // stack: list_len, rlp_pos, payload_len, value_ptr, retdest + // stack: list_len, rlp_addr, payload_len, value_ptr, cur_len, retdest SWAP1 - // stack: rlp_pos, list_len, payload_len, value_ptr, retdest + // stack: rlp_addr, list_len, payload_len, value_ptr, cur_len, retdest %encode_rlp_multi_byte_string_prefix - // stack: rlp_pos_2, payload_len, value_ptr, retdest + // stack: rlp_pos_2, payload_len, value_ptr, cur_len, retdest %encode_rlp_list_prefix - // stack: rlp_pos_3, value_ptr, retdest + // stack: rlp_pos_3, value_ptr, cur_len, retdest DUP2 %mload_trie_data // nonce = value[0] - // stack: nonce, rlp_pos_3, value_ptr, retdest + // stack: nonce, rlp_pos_3, value_ptr, cur_len, retdest SWAP1 %encode_rlp_scalar - // stack: rlp_pos_4, value_ptr, retdest + // stack: rlp_pos_4, value_ptr, cur_len, retdest DUP2 %increment %mload_trie_data // balance = value[1] - // stack: balance, rlp_pos_4, value_ptr, retdest + // stack: balance, rlp_pos_4, value_ptr, cur_len, retdest SWAP1 %encode_rlp_scalar - // stack: 
rlp_pos_5, value_ptr, retdest - DUP2 %add_const(2) %mload_trie_data // storage_root_ptr = value[2] - // stack: storage_root_ptr, rlp_pos_5, value_ptr, retdest + // stack: rlp_pos_5, value_ptr, cur_len, retdest + DUP3 + DUP3 %add_const(2) %mload_trie_data // storage_root_ptr = value[2] + // stack: storage_root_ptr, cur_len, rlp_pos_5, value_ptr, cur_len, retdest + + + PUSH debug_after_hash_storage_trie + POP + + // Hash storage trie. %mpt_hash_storage_trie - // stack: storage_root_digest, rlp_pos_5, value_ptr, retdest - SWAP1 %encode_rlp_256 - // stack: rlp_pos_6, value_ptr, retdest + // stack: storage_root_digest, new_len, rlp_pos_5, value_ptr, cur_len, retdest + %stack(storage_root_digest, new_len, rlp_pos_five, value_ptr, cur_len) -> (rlp_pos_five, storage_root_digest, value_ptr, new_len) + %encode_rlp_256 + // stack: rlp_pos_6, value_ptr, new_len, retdest SWAP1 %add_const(3) %mload_trie_data // code_hash = value[3] - // stack: code_hash, rlp_pos_6, retdest + // stack: code_hash, rlp_pos_6, new_len, retdest SWAP1 %encode_rlp_256 - // stack: rlp_pos_7, retdest - SWAP1 + // stack: rlp_pos_7, new_len, retdest + %stack(rlp_pos_7, new_len, retdest) -> (retdest, rlp_pos_7, new_len) JUMP global encode_txn: - // stack: rlp_pos, value_ptr, retdest + // stack: rlp_addr, value_ptr, cur_len, retdest - // Load the txn_rlp_len which is at the beginnig of value_ptr + // Load the txn_rlp_len which is at the beginning of value_ptr DUP2 %mload_trie_data - // stack: txn_rlp_len, rlp_pos, value_ptr, retdest + // stack: txn_rlp_len, rlp_addr, value_ptr, cur_len, retdest + // We need to add 1+txn_rlp_len to the length of the trie data. 
+ SWAP3 DUP4 %increment ADD + // stack: new_len, rlp_addr, value_ptr, txn_rlp_len, retdest + SWAP3 SWAP2 %increment - // stack: txn_rlp_ptr=value_ptr+1, rlp_pos, txn_rlp_len, retdest + // stack: txn_rlp_ptr=value_ptr+1, rlp_addr, txn_rlp_len, new_len, retdest - %stack (txn_rlp_ptr, rlp_pos, txn_rlp_len) -> (rlp_pos, txn_rlp_len, txn_rlp_len, txn_rlp_ptr) + %stack (txn_rlp_ptr, rlp_addr, txn_rlp_len) -> (rlp_addr, txn_rlp_len, txn_rlp_len, txn_rlp_ptr) // Encode the txn rlp prefix - // stack: rlp_pos, txn_rlp_len, txn_rlp_len, txn_rlp_ptr, retdest + // stack: rlp_addr, txn_rlp_len, txn_rlp_len, txn_rlp_ptr, cur_len, retdest %encode_rlp_multi_byte_string_prefix // copy txn_rlp to the new block - // stack: rlp_pos, txn_rlp_len, txn_rlp_ptr, retdest - %stack (rlp_pos, txn_rlp_len, txn_rlp_ptr) -> ( - 0, @SEGMENT_RLP_RAW, rlp_pos, // dest addr - 0, @SEGMENT_TRIE_DATA, txn_rlp_ptr, // src addr. Kernel has context 0 + // stack: rlp_addr, txn_rlp_len, txn_rlp_ptr, new_len, retdest + %stack (rlp_addr, txn_rlp_len, txn_rlp_ptr) -> ( + @SEGMENT_TRIE_DATA, txn_rlp_ptr, // src addr. Kernel has context 0 + rlp_addr, // dest addr txn_rlp_len, // mcpy len - txn_rlp_len, rlp_pos) + txn_rlp_len, rlp_addr) + %build_kernel_address + SWAP1 + // stack: DST, SRC, txn_rlp_len, txn_rlp_len, rlp_addr, new_len, retdest %memcpy_bytes ADD - // stack new_rlp_pos, retdest - SWAP1 + // stack new_rlp_addr, new_len, retdest + %stack(new_rlp_addr, new_len, retdest) -> (retdest, new_rlp_addr, new_len) JUMP // We assume a receipt in memory is stored as: // [payload_len, status, cum_gas_used, bloom, logs_payload_len, num_logs, [logs]]. // A log is [payload_len, address, num_topics, [topics], data_len, [data]]. global encode_receipt: - // stack: rlp_pos, value_ptr, retdest - // There is a double encoding! What we compute is: - // either RLP(RLP(receipt)) for Legacy transactions or RLP(txn_type||RLP(receipt)) for transactions of type 1 or 2. 
+ // stack: rlp_addr, value_ptr, cur_len, retdest + // First, we add 261 to the trie data length for all values before the logs besides the type. + // These are: the payload length, the status, cum_gas_used, the bloom filter (256 elements), + // the length of the logs payload and the length of the logs. + SWAP2 %add_const(261) SWAP2 + // There is a double encoding! + // What we compute is: + // - either RLP(RLP(receipt)) for Legacy transactions + // - or RLP(txn_type||RLP(receipt)) for transactions of type 1 or 2. // First encode the wrapper prefix. DUP2 %mload_trie_data - // stack: first_value, rlp_pos, value_ptr, retdest + // stack: first_value, rlp_addr, value_ptr, cur_len, retdest // The first value is either the transaction type or the payload length. // Since the receipt contains at least the 256-bytes long bloom filter, payload_len > 3. DUP1 %lt_const(3) %jumpi(encode_nonzero_receipt_type) // If we are here, then the first byte is the payload length. %rlp_list_len - // stack: rlp_receipt_len, rlp_pos, value_ptr, retdest + // stack: rlp_receipt_len, rlp_addr, value_ptr, cur_len, retdest SWAP1 %encode_rlp_multi_byte_string_prefix - // stack: rlp_pos, value_ptr, retdest + // stack: rlp_addr, value_ptr, cur_len, retdest encode_receipt_after_type: - // stack: rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, payload_len_ptr, cur_len, retdest // Then encode the receipt prefix. // `payload_ptr` is either `value_ptr` or `value_ptr+1`, depending on the transaction type. DUP2 %mload_trie_data - // stack: payload_len, rlp_pos, payload_len_ptr, retdest + // stack: payload_len, rlp_addr, payload_len_ptr, cur_len, retdest SWAP1 %encode_rlp_list_prefix - // stack: rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, payload_len_ptr, cur_len, retdest // Encode status. 
DUP2 %increment %mload_trie_data - // stack: status, rlp_pos, payload_len_ptr, retdest + // stack: status, rlp_addr, payload_len_ptr, cur_len, retdest SWAP1 %encode_rlp_scalar - // stack: rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, payload_len_ptr, cur_len, retdest // Encode cum_gas_used. DUP2 %add_const(2) %mload_trie_data - // stack: cum_gas_used, rlp_pos, payload_len_ptr, retdest + // stack: cum_gas_used, rlp_addr, payload_len_ptr, cur_len, retdest SWAP1 %encode_rlp_scalar - // stack: rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, payload_len_ptr, cur_len, retdest // Encode bloom. PUSH 256 // Bloom length. - DUP3 %add_const(3) PUSH @SEGMENT_TRIE_DATA PUSH 0 // MPT src address. - DUP5 - // stack: rlp_pos, SRC, 256, rlp_pos, payload_len_ptr, retdest + DUP3 %add_const(3) PUSH @SEGMENT_TRIE_DATA %build_kernel_address // MPT src address. + DUP3 + // stack: rlp_addr, SRC, 256, rlp_addr, payload_len_ptr, cur_len, retdest %encode_rlp_string - // stack: rlp_pos, old_rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, old_rlp_pos, payload_len_ptr, cur_len, retdest SWAP1 POP - // stack: rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, payload_len_ptr, cur_len, retdest // Encode logs prefix. 
DUP2 %add_const(259) %mload_trie_data - // stack: logs_payload_len, rlp_pos, payload_len_ptr, retdest + // stack: logs_payload_len, rlp_addr, payload_len_ptr, cur_len, retdest SWAP1 %encode_rlp_list_prefix - // stack: rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, payload_len_ptr, cur_len, retdest DUP2 %add_const(261) - // stack: logs_ptr, rlp_pos, payload_len_ptr, retdest + // stack: logs_ptr, rlp_addr, payload_len_ptr, cur_len, retdest DUP3 %add_const(260) %mload_trie_data - // stack: num_logs, logs_ptr, rlp_pos, payload_len_ptr, retdest + // stack: num_logs, logs_ptr, rlp_addr, payload_len_ptr, cur_len, retdest PUSH 0 encode_receipt_logs_loop: - // stack: i, num_logs, current_log_ptr, rlp_pos, payload_len_ptr, retdest + // stack: i, num_logs, current_log_ptr, rlp_addr, payload_len_ptr, cur_len, retdest DUP2 DUP2 EQ - // stack: i == num_logs, i, num_logs, current_log_ptr, rlp_pos, payload_len_ptr, retdest + // stack: i == num_logs, i, num_logs, current_log_ptr, rlp_addr, payload_len_ptr, cur_len, retdest %jumpi(encode_receipt_end) - // stack: i, num_logs, current_log_ptr, rlp_pos, payload_len_ptr, retdest + // We add 4 to the trie data length for the fixed size elements in the current log. + SWAP5 %add_const(4) SWAP5 + // stack: i, num_logs, current_log_ptr, rlp_addr, payload_len_ptr, cur_len, retdest DUP3 DUP5 - // stack: rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest // Encode log prefix. 
DUP2 %mload_trie_data - // stack: payload_len, rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: payload_len, rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest SWAP1 %encode_rlp_list_prefix - // stack: rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest // Encode address. DUP2 %increment %mload_trie_data - // stack: address, rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: address, rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest SWAP1 %encode_rlp_160 - // stack: rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest DUP2 %add_const(2) %mload_trie_data - // stack: num_topics, rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: num_topics, rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest // Encode topics prefix. 
DUP1 %mul_const(33) - // stack: topics_payload_len, num_topics, rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: topics_payload_len, num_topics, rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest DUP3 %encode_rlp_list_prefix - // stack: new_rlp_pos, num_topics, rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: new_rlp_pos, num_topics, rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest SWAP2 POP - // stack: num_topics, rlp_pos, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: num_topics, rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len, retdest + + // Add `num_topics` to the length of the trie data segment. + DUP1 SWAP9 + // stack: cur_len, num_topics, rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, num_topics, retdest + ADD SWAP8 + + // stack: num_topics, rlp_addr, current_log_ptr, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest SWAP2 %add_const(3) - // stack: topics_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: topics_ptr, rlp_addr, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest PUSH 0 encode_receipt_topics_loop: - // stack: j, topics_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: j, topics_ptr, rlp_addr, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest DUP4 DUP2 EQ - // stack: j == num_topics, j, topics_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: j == num_topics, j, topics_ptr, rlp_addr, num_topics, i, num_logs, current_log_ptr, 
old_rlp_pos, payload_len_ptr, cur_len', retdest %jumpi(encode_receipt_topics_end) - // stack: j, topics_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: j, topics_ptr, rlp_addr, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest DUP2 DUP2 ADD %mload_trie_data - // stack: current_topic, j, topics_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: current_topic, j, topics_ptr, rlp_addr, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest DUP4 - // stack: rlp_pos, current_topic, j, topics_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, current_topic, j, topics_ptr, rlp_addr, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest %encode_rlp_256 - // stack: new_rlp_pos, j, topics_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: new_rlp_pos, j, topics_ptr, rlp_addr, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest SWAP3 POP - // stack: j, topics_ptr, new_rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: j, topics_ptr, new_rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest %increment %jump(encode_receipt_topics_loop) encode_receipt_topics_end: - // stack: num_topics, topics_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: num_topics, topics_ptr, rlp_addr, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest ADD - // stack: data_len_ptr, rlp_pos, num_topics, i, num_logs, current_log_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: data_len_ptr, rlp_addr, num_topics, i, num_logs, 
current_log_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest SWAP5 POP - // stack: rlp_pos, num_topics, i, num_logs, data_len_ptr, old_rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, num_topics, i, num_logs, data_len_ptr, old_rlp_pos, payload_len_ptr, cur_len', retdest SWAP5 POP - // stack: num_topics, i, num_logs, data_len_ptr, rlp_pos, payload_len_ptr, retdest + // stack: num_topics, i, num_logs, data_len_ptr, rlp_addr, payload_len_ptr, cur_len', retdest POP - // stack: i, num_logs, data_len_ptr, rlp_pos, payload_len_ptr, retdest + // stack: i, num_logs, data_len_ptr, rlp_addr, payload_len_ptr, cur_len', retdest // Encode data prefix. DUP3 %mload_trie_data - // stack: data_len, i, num_logs, data_len_ptr, rlp_pos, payload_len_ptr, retdest + // stack: data_len, i, num_logs, data_len_ptr, rlp_addr, payload_len_ptr, cur_len', retdest + + // Add `data_len` to the length of the trie data. + DUP1 SWAP7 ADD SWAP6 + + // stack: data_len, i, num_logs, data_len_ptr, rlp_addr, payload_len_ptr, cur_len'', retdest DUP4 %increment DUP2 ADD - // stack: next_log_ptr, data_len, i, num_logs, data_len_ptr, rlp_pos, payload_len_ptr, retdest + // stack: next_log_ptr, data_len, i, num_logs, data_len_ptr, rlp_addr, payload_len_ptr, cur_len'', retdest SWAP4 %increment - // stack: data_ptr, data_len, i, num_logs, next_log_ptr, rlp_pos, payload_len_ptr, retdest - PUSH @SEGMENT_TRIE_DATA PUSH 0 - // stack: SRC, data_len, i, num_logs, next_log_ptr, rlp_pos, payload_len_ptr, retdest - DUP8 - // stack: rlp_pos, SRC, data_len, i, num_logs, next_log_ptr, rlp_pos, payload_len_ptr, retdest + // stack: data_ptr, data_len, i, num_logs, next_log_ptr, rlp_addr, payload_len_ptr, cur_len'', retdest + PUSH @SEGMENT_TRIE_DATA %build_kernel_address + // stack: SRC, data_len, i, num_logs, next_log_ptr, rlp_addr, payload_len_ptr, cur_len'', retdest + DUP6 + // stack: rlp_addr, SRC, data_len, i, num_logs, next_log_ptr, rlp_addr, payload_len_ptr, cur_len'', retdest %encode_rlp_string - // stack: 
new_rlp_pos, i, num_logs, next_log_ptr, rlp_pos, payload_len_ptr, retdest + // stack: new_rlp_pos, i, num_logs, next_log_ptr, rlp_addr, payload_len_ptr, cur_len'', retdest SWAP4 POP - // stack: i, num_logs, next_log_ptr, new_rlp_pos, payload_len_ptr, retdest + // stack: i, num_logs, next_log_ptr, new_rlp_pos, payload_len_ptr, cur_len'', retdest %increment %jump(encode_receipt_logs_loop) encode_receipt_end: - // stack: num_logs, num_logs, current_log_ptr, rlp_pos, payload_len_ptr, retdest + // stack: num_logs, num_logs, current_log_ptr, rlp_addr, payload_len_ptr, cur_len'', retdest %pop3 - // stack: rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, payload_len_ptr, cur_len'', retdest SWAP1 POP - // stack: rlp_pos, retdest - SWAP1 + // stack: rlp_addr, cur_len'', retdest + %stack(rlp_addr, new_len, retdest) -> (retdest, rlp_addr, new_len) JUMP encode_nonzero_receipt_type: - // stack: txn_type, rlp_pos, value_ptr, retdest + // stack: txn_type, rlp_addr, value_ptr, cur_len, retdest + // We have a nonlegacy receipt, so the type is also stored in the trie data segment. 
+ SWAP3 %increment SWAP3 + // stack: txn_type, rlp_addr, value_ptr, cur_len, retdest DUP3 %increment %mload_trie_data - // stack: payload_len, txn_type, rlp_pos, value_ptr, retdest + // stack: payload_len, txn_type, rlp_addr, value_ptr, retdest // The transaction type is encoded in 1 byte %increment %rlp_list_len - // stack: rlp_receipt_len, txn_type, rlp_pos, value_ptr, retdest + // stack: rlp_receipt_len, txn_type, rlp_addr, value_ptr, retdest DUP3 %encode_rlp_multi_byte_string_prefix - // stack: rlp_pos, txn_type, old_rlp_pos, value_ptr, retdest - DUP2 DUP2 - %mstore_rlp + // stack: rlp_addr, txn_type, old_rlp_addr, value_ptr, retdest + DUP1 DUP3 + MSTORE_GENERAL %increment - // stack: rlp_pos, txn_type, old_rlp_pos, value_ptr, retdest - %stack (rlp_pos, txn_type, old_rlp_pos, value_ptr, retdest) -> (rlp_pos, value_ptr, retdest) + // stack: rlp_addr, txn_type, old_rlp_addr, value_ptr, retdest + %stack (rlp_addr, txn_type, old_rlp_addr, value_ptr, retdest) -> (rlp_addr, value_ptr, retdest) // We replace `value_ptr` with `paylaod_len_ptr` so we can encode the rest of the data more easily SWAP1 %increment SWAP1 - // stack: rlp_pos, payload_len_ptr, retdest + // stack: rlp_addr, payload_len_ptr, retdest %jump(encode_receipt_after_type) global encode_storage_value: - // stack: rlp_pos, value_ptr, retdest + // stack: rlp_addr, value_ptr, cur_len, retdest SWAP1 %mload_trie_data SWAP1 - // stack: rlp_pos, value, retdest + + // A storage value is a scalar, so we only need to add 1 to the trie data length. + SWAP2 %increment SWAP2 + + // stack: rlp_addr, value, cur_len, retdest // The YP says storage trie is a map "... to the RLP-encoded 256-bit integer values" // which seems to imply that this should be %encode_rlp_256. But %encode_rlp_scalar // causes the tests to pass, so it seems storage values should be treated as variable- // length after all. 
%doubly_encode_rlp_scalar - // stack: rlp_pos', retdest - SWAP1 + // stack: rlp_addr', cur_len, retdest + %stack (rlp_addr, cur_len, retdest) -> (retdest, rlp_addr, cur_len) JUMP diff --git a/evm/src/cpu/kernel/asm/mpt/hex_prefix.asm b/evm/src/cpu/kernel/asm/mpt/hex_prefix.asm index b7a3073ba2..0ca2458f0c 100644 --- a/evm/src/cpu/kernel/asm/mpt/hex_prefix.asm +++ b/evm/src/cpu/kernel/asm/mpt/hex_prefix.asm @@ -3,20 +3,16 @@ // given position, and returns the updated position, i.e. a pointer to the next // unused offset. // -// Pre stack: rlp_start_pos, num_nibbles, packed_nibbles, terminated, retdest -// Post stack: rlp_end_pos - +// Pre stack: rlp_start_addr, num_nibbles, packed_nibbles, terminated, retdest +// Post stack: rlp_end_addr global hex_prefix_rlp: - // stack: rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - // We will iterate backwards, from i = num_nibbles / 2 to i = 0, so that we - // can take nibbles from the least-significant end of packed_nibbles. - PUSH 2 DUP3 DIV // i = num_nibbles / 2 - // stack: i, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - + DUP2 %assert_lt_const(65) + + PUSH 2 DUP3 DIV // Compute the length of the hex-prefix string, in bytes: // hp_len = num_nibbles / 2 + 1 = i + 1 - DUP1 %increment - // stack: hp_len, i, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest + %increment + // stack: hp_len, rlp_addr, num_nibbles, packed_nibbles, terminated, retdest // Write the RLP header. DUP1 %gt_const(55) %jumpi(rlp_header_large) @@ -25,80 +21,111 @@ global hex_prefix_rlp: // The hex-prefix is a single byte. It must be <= 127, since its first // nibble only has two bits. So this is the "small" RLP string case, where // the byte is its own RLP encoding. 
- // stack: hp_len, i, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - %jump(start_loop) - -rlp_header_medium: - // stack: hp_len, i, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - DUP1 %add_const(0x80) // value = 0x80 + hp_len - DUP4 // offset = rlp_pos - %mstore_rlp + // stack: hp_len, rlp_addr, num_nibbles, packed_nibbles, terminated, retdest + POP +first_byte: + // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, retdest + // get the first nibble, if num_nibbles is odd, or zero otherwise + SWAP2 + // stack: packed_nibbles, num_nibbles, rlp_addr, terminated, retdest + DUP2 + PUSH 2 DUP2 MOD + // stack: parity, num_nibbles, packed_nibbles, num_nibbles, rlp_addr, terminated, retdest + SWAP1 SUB + %mul_const(4) + SHR + // stack: first_nibble_or_zero, num_nibbles, rlp_addr, terminated, retdest + SWAP2 + // stack: rlp_addr, num_nibbles, first_nibble_or_zero, terminated, retdest + SWAP3 + // stack: terminated, num_nibbles, first_nibble_or_zero, rlp_addr, retdest + %mul_const(2) + // stack: terminated * 2, num_nibbles, first_nibble_or_zero, rlp_addr, retdest + SWAP1 + // stack: num_nibbles, terminated * 2, first_nibble_or_zero, rlp_addr, retdest + %mod_const(2) // parity + ADD + // stack: parity + terminated * 2, first_nibble_or_zero, rlp_addr, retdest + %mul_const(16) + ADD + // stack: first_byte, rlp_addr, retdest + DUP2 + %swap_mstore + %increment + // stack: rlp_addr', retdest + SWAP1 + JUMP + +remaining_bytes: + // stack: rlp_addr, num_nibbles, packed_nibbles, retdest + SWAP2 + PUSH @U256_MAX + // stack: U256_MAX, packed_nibbles, num_nibbles, rlp_addr, ret_dest + SWAP1 SWAP2 + PUSH 2 DUP2 MOD + // stack: parity, num_nibbles, U256_MAX, packed_nibbles, rlp_addr, ret_dest + SWAP1 SUB DUP1 + // stack: num_nibbles - parity, num_nibbles - parity, U256_MAX, packed_nibbles, rlp_addr, ret_dest + %div2 + // stack: rem_bytes, num_nibbles - parity, U256_MAX, packed_nibbles, rlp_addr, ret_dest + SWAP2 SWAP1 + // stack: num_nibbles - parity, 
U256_MAX, rem_bytes, packed_nibbles, rlp_addr, ret_dest + %mul_const(4) + // stack: 4*(num_nibbles - parity), U256_MAX, rem_bytes, packed_nibbles, rlp_addr, ret_dest + PUSH 256 SUB + // stack: 256 - 4*(num_nibbles - parity), U256_MAX, rem_bytes, packed_nibbles, rlp_addr, ret_dest + SHR + // stack: mask, rem_bytes, packed_nibbles, rlp_addr, ret_dest + SWAP1 SWAP2 + AND + %stack(remaining_nibbles, rem_bytes, rlp_addr) -> (rlp_addr, remaining_nibbles, rem_bytes) + %mstore_unpacking + SWAP1 + JUMP - // rlp_pos += 1 - SWAP2 %increment SWAP2 - %jump(start_loop) +rlp_header_medium: + // stack: hp_len, rlp_addr, num_nibbles, packed_nibbles, terminated, retdest + %add_const(0x80) // value = 0x80 + hp_len + DUP2 + %swap_mstore + // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, retdest + // rlp_addr += 1 + %increment + + // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, retdest + SWAP3 DUP3 DUP3 + // stack: num_nibbles, packed_nibbles, terminated, num_nibbles, packed_nibbles, rlp_addr, retdest + PUSH remaining_bytes + // stack: remaining_bytes, num_nibbles, packed_nibbles, terminated, num_nibbles, packed_nibbles, rlp_addr, retdest + SWAP4 SWAP5 SWAP6 + // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, remaining_bytes, num_nibbles, packed_nibbles, retdest + + %jump(first_byte) rlp_header_large: - // stack: hp_len, i, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest + // stack: hp_len, rlp_addr, num_nibbles, packed_nibbles, terminated, retdest // In practice hex-prefix length will never exceed 256, so the length of the // length will always be 1 byte in this case. 
+ DUP2 // rlp_addr PUSH 0xb8 // value = 0xb7 + len_of_len = 0xb8 - DUP4 // offset = rlp_pos - %mstore_rlp - - DUP1 // value = hp_len - DUP4 %increment // offset = rlp_pos + 1 - %mstore_rlp - - // rlp_pos += 2 - SWAP2 %add_const(2) SWAP2 - -start_loop: - // stack: hp_len, i, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - SWAP1 + MSTORE_GENERAL -loop: - // stack: i, hp_len, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - // If i == 0, break to first_byte. - DUP1 ISZERO %jumpi(first_byte) + // stack: hp_len, rlp_addr, num_nibbles, packed_nibbles, terminated, retdest + DUP2 %increment + %swap_mstore - // stack: i, hp_len, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - DUP5 // packed_nibbles - %and_const(0xFF) - // stack: byte_i, i, hp_len, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - DUP4 // rlp_pos - DUP3 // i - ADD // We'll write to offset rlp_pos + i - %mstore_rlp + // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, retdest + // rlp_addr += 2 + %add_const(2) - // stack: i, hp_len, rlp_pos, num_nibbles, packed_nibbles, terminated, retdest - %decrement - SWAP4 %shr_const(8) SWAP4 // packed_nibbles >>= 8 - %jump(loop) + // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, retdest + SWAP3 DUP3 DUP3 + // stack: num_nibbles, packed_nibbles, terminated, num_nibbles, packed_nibbles, rlp_addr, retdest + PUSH remaining_bytes + // stack: remaining_bytes, num_nibbles, packed_nibbles, terminated, num_nibbles, packed_nibbles, rlp_addr, retdest + SWAP4 SWAP5 SWAP6 + // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, remaining_bytes, num_nibbles, packed_nibbles, retdest -first_byte: - // stack: 0, hp_len, rlp_pos, num_nibbles, first_nibble_or_zero, terminated, retdest - POP - // stack: hp_len, rlp_pos, num_nibbles, first_nibble_or_zero, terminated, retdest - DUP2 ADD - // stack: rlp_end_pos, rlp_pos, num_nibbles, first_nibble_or_zero, terminated, retdest - SWAP4 - // stack: terminated, rlp_pos, 
num_nibbles, first_nibble_or_zero, rlp_end_pos, retdest - %mul_const(2) - // stack: terminated * 2, rlp_pos, num_nibbles, first_nibble_or_zero, rlp_end_pos, retdest - %stack (terminated_x2, rlp_pos, num_nibbles, first_nibble_or_zero) - -> (num_nibbles, terminated_x2, first_nibble_or_zero, rlp_pos) - // stack: num_nibbles, terminated * 2, first_nibble_or_zero, rlp_pos, rlp_end_pos, retdest - %mod_const(2) // parity - ADD - // stack: parity + terminated * 2, first_nibble_or_zero, rlp_pos, rlp_end_pos, retdest - %mul_const(16) - ADD - // stack: first_byte, rlp_pos, rlp_end_pos, retdest - SWAP1 - %mstore_rlp - // stack: rlp_end_pos, retdest - SWAP1 - JUMP + %jump(first_byte) diff --git a/evm/src/cpu/kernel/asm/mpt/insert/insert_extension.asm b/evm/src/cpu/kernel/asm/mpt/insert/insert_extension.asm index 3ead805b1d..21a4b7558b 100644 --- a/evm/src/cpu/kernel/asm/mpt/insert/insert_extension.asm +++ b/evm/src/cpu/kernel/asm/mpt/insert/insert_extension.asm @@ -74,9 +74,21 @@ node_key_continues: // Pseudocode: new_node = [MPT_TYPE_BRANCH] + [0] * 17 %get_trie_data_size // pointer to the branch node we're about to create PUSH @MPT_NODE_BRANCH %append_to_trie_data - %rep 17 - PUSH 0 %append_to_trie_data - %endrep + + PUSH 0 + // Increment trie data size by 17 + %get_trie_data_size + // stack: trie_data_size, 0 + DUP1 + %add_const(17) + %set_trie_data_size + + // stack: trie_data_size, 0 + + // Write 17 consecutive 0s at once + PUSH @SEGMENT_TRIE_DATA %build_kernel_address + MSTORE_32BYTES_17 + POP process_node_child: // stack: new_node_ptr, common_len, common_key, node_len, node_key, insert_len, insert_key, node_child_ptr, insert_value_ptr, retdest diff --git a/evm/src/cpu/kernel/asm/mpt/insert/insert_leaf.asm b/evm/src/cpu/kernel/asm/mpt/insert/insert_leaf.asm index 72a014ceec..806fc0ddbd 100644 --- a/evm/src/cpu/kernel/asm/mpt/insert/insert_leaf.asm +++ b/evm/src/cpu/kernel/asm/mpt/insert/insert_leaf.asm @@ -69,9 +69,22 @@ global mpt_insert_leaf: // For now, we allocate the 
branch node, initially with no children or value. %get_trie_data_size // pointer to the branch node we're about to create PUSH @MPT_NODE_BRANCH %append_to_trie_data - %rep 17 - PUSH 0 %append_to_trie_data - %endrep + + PUSH 0 + // Increment trie data size by 17 + %get_trie_data_size + // stack: trie_data_size, 0 + DUP1 + %add_const(17) + %set_trie_data_size + + // stack: trie_data_size, 0 + + // Write 17 consecutive 0s at once + PUSH @SEGMENT_TRIE_DATA %build_kernel_address + MSTORE_32BYTES_17 + POP + // stack: branch_ptr, common_len, common_key, node_len, node_key, insert_len, insert_key, node_value_ptr, insert_value_ptr, retdest // Now, we branch based on whether each key continues beyond the common diff --git a/evm/src/cpu/kernel/asm/mpt/insert/insert_trie_specific.asm b/evm/src/cpu/kernel/asm/mpt/insert/insert_trie_specific.asm index 1bf9f6f8fb..71f78ec5bd 100644 --- a/evm/src/cpu/kernel/asm/mpt/insert/insert_trie_specific.asm +++ b/evm/src/cpu/kernel/asm/mpt/insert/insert_trie_specific.asm @@ -71,18 +71,18 @@ mpt_insert_receipt_trie_save: global scalar_to_rlp: // stack: scalar, retdest %mload_global_metadata(@GLOBAL_METADATA_RLP_DATA_SIZE) - // stack: pos, scalar, retdest + // stack: init_addr, scalar, retdest SWAP1 DUP2 %encode_rlp_scalar - // stack: pos', init_pos, retdest + // stack: addr', init_addr, retdest // Now our rlp_encoding is in RlpRaw. 
// Set new RlpRaw data size DUP1 %mstore_global_metadata(@GLOBAL_METADATA_RLP_DATA_SIZE) DUP2 DUP2 SUB // len of the key - // stack: len, pos', init_pos, retdest - DUP3 PUSH @SEGMENT_RLP_RAW PUSH 0 // address where we get the key from - %mload_packing - // stack: packed_key, pos', init_pos, retdest + // stack: len, addr', init_addr, retdest + DUP3 + MLOAD_32BYTES + // stack: packed_key, addr', init_addr, retdest SWAP2 %pop2 // stack: key, retdest SWAP1 diff --git a/evm/src/cpu/kernel/asm/mpt/load/load.asm b/evm/src/cpu/kernel/asm/mpt/load/load.asm deleted file mode 100644 index d787074b4f..0000000000 --- a/evm/src/cpu/kernel/asm/mpt/load/load.asm +++ /dev/null @@ -1,173 +0,0 @@ -// Load all partial trie data from prover inputs. -global load_all_mpts: - // stack: retdest - // First set @GLOBAL_METADATA_TRIE_DATA_SIZE = 1. - // We don't want it to start at 0, as we use 0 as a null pointer. - PUSH 1 - %set_trie_data_size - - %load_mpt(mpt_load_state_trie_value) %mstore_global_metadata(@GLOBAL_METADATA_STATE_TRIE_ROOT) - %load_mpt(mpt_load_txn_trie_value) %mstore_global_metadata(@GLOBAL_METADATA_TXN_TRIE_ROOT) - %load_mpt(mpt_load_receipt_trie_value) %mstore_global_metadata(@GLOBAL_METADATA_RECEIPT_TRIE_ROOT) - - // stack: retdest - JUMP - -// Load an MPT from prover inputs. -// Pre stack: load_value, retdest -// Post stack: node_ptr -global load_mpt: - // stack: load_value, retdest - PROVER_INPUT(mpt) - // stack: node_type, load_value, retdest - - DUP1 %eq_const(@MPT_NODE_EMPTY) %jumpi(load_mpt_empty) - DUP1 %eq_const(@MPT_NODE_BRANCH) %jumpi(load_mpt_branch) - DUP1 %eq_const(@MPT_NODE_EXTENSION) %jumpi(load_mpt_extension) - DUP1 %eq_const(@MPT_NODE_LEAF) %jumpi(load_mpt_leaf) - DUP1 %eq_const(@MPT_NODE_HASH) %jumpi(load_mpt_digest) - PANIC // Invalid node type - -load_mpt_empty: - // TRIE_DATA[0] = 0, and an empty node has type 0, so we can simply return the null pointer. 
- %stack (node_type, load_value, retdest) -> (retdest, 0) - JUMP - -load_mpt_branch: - // stack: node_type, load_value, retdest - %get_trie_data_size - // stack: node_ptr, node_type, load_value, retdest - SWAP1 %append_to_trie_data - // stack: node_ptr, load_value, retdest - // Save the offset of our 16 child pointers so we can write them later. - // Then advance our current trie pointer beyond them, so we can load the - // value and have it placed after our child pointers. - %get_trie_data_size - // stack: children_ptr, node_ptr, load_value, retdest - DUP1 %add_const(17) // Skip over 16 children plus the value pointer - // stack: end_of_branch_ptr, children_ptr, node_ptr, load_value, retdest - DUP1 %set_trie_data_size - // Now the top of the stack points to where the branch node will end and the - // value will begin, if there is a value. But we need to ask the prover if a - // value is present, and point to null if not. - // stack: end_of_branch_ptr, children_ptr, node_ptr, load_value, retdest - PROVER_INPUT(mpt) - // stack: is_value_present, end_of_branch_ptr, children_ptr, node_ptr, load_value, retdest - %jumpi(load_mpt_branch_value_present) - // There is no value present, so value_ptr = null. - %stack (end_of_branch_ptr) -> (0) - // stack: value_ptr, children_ptr, node_ptr, load_value, retdest - %jump(load_mpt_branch_after_load_value) -load_mpt_branch_value_present: - // stack: value_ptr, children_ptr, node_ptr, load_value, retdest - PUSH load_mpt_branch_after_load_value - DUP5 // load_value - JUMP -load_mpt_branch_after_load_value: - // stack: value_ptr, children_ptr, node_ptr, load_value, retdest - SWAP1 - // stack: children_ptr, value_ptr, node_ptr, load_value, retdest - - // Load the 16 children. 
- %rep 16 - DUP4 // load_value - %load_mpt - // stack: child_ptr, next_child_ptr_ptr, value_ptr, node_ptr, load_value, retdest - DUP2 - // stack: next_child_ptr_ptr, child_ptr, next_child_ptr_ptr, value_ptr, node_ptr, load_value, retdest - %mstore_trie_data - // stack: next_child_ptr_ptr, value_ptr, node_ptr, load_value, retdest - %increment - // stack: next_child_ptr_ptr, value_ptr, node_ptr, load_value, retdest - %endrep - - // stack: value_ptr_ptr, value_ptr, node_ptr, load_value, retdest - %mstore_trie_data - %stack (node_ptr, load_value, retdest) -> (retdest, node_ptr) - JUMP - -load_mpt_extension: - // stack: node_type, load_value, retdest - %get_trie_data_size - // stack: node_ptr, node_type, load_value, retdest - SWAP1 %append_to_trie_data - // stack: node_ptr, load_value, retdest - PROVER_INPUT(mpt) // read num_nibbles - %append_to_trie_data - PROVER_INPUT(mpt) // read packed_nibbles - %append_to_trie_data - // stack: node_ptr, load_value, retdest - - %get_trie_data_size - // stack: child_ptr_ptr, node_ptr, load_value, retdest - // Increment trie_data_size, to leave room for child_ptr_ptr, before we load our child. 
- DUP1 %increment %set_trie_data_size - %stack (child_ptr_ptr, node_ptr, load_value, retdest) - -> (load_value, load_mpt_extension_after_load_mpt, - child_ptr_ptr, retdest, node_ptr) - %jump(load_mpt) -load_mpt_extension_after_load_mpt: - // stack: child_ptr, child_ptr_ptr, retdest, node_ptr - SWAP1 %mstore_trie_data - // stack: retdest, node_ptr - JUMP - -load_mpt_leaf: - // stack: node_type, load_value, retdest - %get_trie_data_size - // stack: node_ptr, node_type, load_value, retdest - SWAP1 %append_to_trie_data - // stack: node_ptr, load_value, retdest - PROVER_INPUT(mpt) // read num_nibbles - %append_to_trie_data - PROVER_INPUT(mpt) // read packed_nibbles - %append_to_trie_data - // stack: node_ptr, load_value, retdest - // We save value_ptr_ptr = get_trie_data_size, then increment trie_data_size - // to skip over the slot for value_ptr_ptr. We will write to value_ptr_ptr - // after the load_value call. - %get_trie_data_size - // stack: value_ptr_ptr, node_ptr, load_value, retdest - DUP1 %increment - // stack: value_ptr, value_ptr_ptr, node_ptr, load_value, retdest - DUP1 %set_trie_data_size - // stack: value_ptr, value_ptr_ptr, node_ptr, load_value, retdest - %stack (value_ptr, value_ptr_ptr, node_ptr, load_value, retdest) - -> (load_value, load_mpt_leaf_after_load_value, - value_ptr_ptr, value_ptr, retdest, node_ptr) - JUMP -load_mpt_leaf_after_load_value: - // stack: value_ptr_ptr, value_ptr, retdest, node_ptr - %mstore_trie_data - // stack: retdest, node_ptr - JUMP - -load_mpt_digest: - // stack: node_type, load_value, retdest - %get_trie_data_size - // stack: node_ptr, node_type, load_value, retdest - SWAP1 %append_to_trie_data - // stack: node_ptr, load_value, retdest - PROVER_INPUT(mpt) // read digest - %append_to_trie_data - %stack (node_ptr, load_value, retdest) -> (retdest, node_ptr) - JUMP - -// Convenience macro to call load_mpt and return where we left off. 
-// Pre stack: load_value -// Post stack: node_ptr -%macro load_mpt - %stack (load_value) -> (load_value, %%after) - %jump(load_mpt) -%%after: -%endmacro - -// Convenience macro to call load_mpt and return where we left off. -// Pre stack: (empty) -// Post stack: node_ptr -%macro load_mpt(load_value) - PUSH %%after - PUSH $load_value - %jump(load_mpt) -%%after: -%endmacro diff --git a/evm/src/cpu/kernel/asm/mpt/load/load_trie_specific.asm b/evm/src/cpu/kernel/asm/mpt/load/load_trie_specific.asm deleted file mode 100644 index 92471fd801..0000000000 --- a/evm/src/cpu/kernel/asm/mpt/load/load_trie_specific.asm +++ /dev/null @@ -1,150 +0,0 @@ -global mpt_load_state_trie_value: - // stack: retdest - - // Load and append the nonce and balance. - PROVER_INPUT(mpt) %append_to_trie_data - PROVER_INPUT(mpt) %append_to_trie_data - - // Now increment the trie data size by 2, to leave room for our storage trie - // pointer and code hash fields, before calling load_mpt which will append - // our storage trie data. 
- %get_trie_data_size - // stack: storage_trie_ptr_ptr, retdest - DUP1 %add_const(2) - // stack: storage_trie_ptr, storage_trie_ptr_ptr, retdest - %set_trie_data_size - // stack: storage_trie_ptr_ptr, retdest - - %load_mpt(mpt_load_storage_trie_value) - // stack: storage_trie_ptr, storage_trie_ptr_ptr, retdest - DUP2 %mstore_trie_data - // stack: storage_trie_ptr_ptr, retdest - %increment - // stack: code_hash_ptr, retdest - PROVER_INPUT(mpt) - // stack: code_hash, code_hash_ptr, retdest - SWAP1 %mstore_trie_data - // stack: retdest - JUMP - -global mpt_load_txn_trie_value: - // stack: retdest - PROVER_INPUT(mpt) - // stack: rlp_len, retdest - // The first element is the rlp length - DUP1 %append_to_trie_data - PUSH 0 - -mpt_load_loop: - // stack: i, rlp_len, retdest - DUP2 DUP2 EQ %jumpi(mpt_load_end) - PROVER_INPUT(mpt) %append_to_trie_data - %increment - %jump(mpt_load_loop) - -mpt_load_end: - // stack: i, rlp_len, retdest - %pop2 - JUMP - -global mpt_load_receipt_trie_value: - // stack: retdest - // Load first byte. It is either `payload_len` or the transaction type. - PROVER_INPUT(mpt) DUP1 %append_to_trie_data - // If the first byte is less than 3, then it is the transaction type, equal to either 1 or 2. - // In that case, we still need to load the payload length. - %lt_const(3) %jumpi(mpt_load_payload_len) - -mpt_load_after_type: - // Load status. - PROVER_INPUT(mpt) %append_to_trie_data - // Load cum_gas_used. - PROVER_INPUT(mpt) %append_to_trie_data - // Load bloom. - %rep 256 - PROVER_INPUT(mpt) %append_to_trie_data - %endrep - // Load logs_payload_len. - PROVER_INPUT(mpt) %append_to_trie_data - // Load num_logs. - PROVER_INPUT(mpt) - DUP1 - %append_to_trie_data - // stack: num_logs, retdest - // Load logs. - PUSH 0 - -mpt_load_receipt_trie_value_logs_loop: - // stack: i, num_logs, retdest - DUP2 DUP2 EQ - // stack: i == num_logs, i, num_logs, retdest - %jumpi(mpt_load_receipt_trie_value_end) - // stack: i, num_logs, retdest - // Load log_payload_len. 
- PROVER_INPUT(mpt) %append_to_trie_data - // Load address. - PROVER_INPUT(mpt) %append_to_trie_data - // Load num_topics. - PROVER_INPUT(mpt) - DUP1 - %append_to_trie_data - // stack: num_topics, i, num_logs, retdest - // Load topics. - PUSH 0 - -mpt_load_receipt_trie_value_topics_loop: - // stack: j, num_topics, i, num_logs, retdest - DUP2 DUP2 EQ - // stack: j == num_topics, j, num_topics, i, num_logs, retdest - %jumpi(mpt_load_receipt_trie_value_topics_end) - // stack: j, num_topics, i, num_logs, retdest - // Load topic. - PROVER_INPUT(mpt) %append_to_trie_data - %increment - %jump(mpt_load_receipt_trie_value_topics_loop) - -mpt_load_receipt_trie_value_topics_end: - // stack: num_topics, num_topics, i, num_logs, retdest - %pop2 - // stack: i, num_logs, retdest - // Load data_len. - PROVER_INPUT(mpt) - DUP1 - %append_to_trie_data - // stack: data_len, i, num_logs, retdest - // Load data. - PUSH 0 - -mpt_load_receipt_trie_value_data_loop: - // stack: j, data_len, i, num_logs, retdest - DUP2 DUP2 EQ - // stack: j == data_len, j, data_len, i, num_logs, retdest - %jumpi(mpt_load_receipt_trie_value_data_end) - // stack: j, data_len, i, num_logs, retdest - // Load data byte. 
- PROVER_INPUT(mpt) %append_to_trie_data - %increment - %jump(mpt_load_receipt_trie_value_data_loop) - -mpt_load_receipt_trie_value_data_end: - // stack: data_len, data_len, i, num_logs, retdest - %pop2 - %increment - %jump(mpt_load_receipt_trie_value_logs_loop) - -mpt_load_receipt_trie_value_end: - // stack: num_logs, num_logs, retdest - %pop2 - JUMP - -mpt_load_payload_len: - // stack: retdest - PROVER_INPUT(mpt) %append_to_trie_data - %jump(mpt_load_after_type) - -global mpt_load_storage_trie_value: - // stack: retdest - PROVER_INPUT(mpt) - %append_to_trie_data - // stack: retdest - JUMP diff --git a/evm/src/cpu/kernel/asm/mpt/util.asm b/evm/src/cpu/kernel/asm/mpt/util.asm index 80e5c6f7c5..9829494c2f 100644 --- a/evm/src/cpu/kernel/asm/mpt/util.asm +++ b/evm/src/cpu/kernel/asm/mpt/util.asm @@ -10,6 +10,12 @@ // stack: (empty) %endmacro +%macro initialize_rlp_segment + PUSH @ENCODED_EMPTY_NODE_POS + PUSH 0x80 + MSTORE_GENERAL +%endmacro + %macro alloc_rlp_block // stack: (empty) %mload_global_metadata(@GLOBAL_METADATA_RLP_DATA_SIZE) @@ -17,7 +23,7 @@ // In our model it's fine to use memory in a sparse way, as long as the gaps aren't larger than // 2^16 or so. So instead of the caller specifying the size of the block they need, we'll just // allocate 0x10000 = 2^16 bytes, much larger than any RLP blob the EVM could possibly create. 
- DUP1 %add_const(0x10000) + DUP1 %add_const(@MAX_RLP_BLOB_SIZE) // stack: block_end, block_start %mstore_global_metadata(@GLOBAL_METADATA_RLP_DATA_SIZE) // stack: block_start @@ -152,9 +158,9 @@ DUP3 DUP6 MUL ISZERO %jumpi(%%return) // first_nib_2 = (key_2 >> (bits_2 - 4)) & 0xF - DUP6 DUP6 %sub_const(4) SHR %and_const(0xF) + DUP6 PUSH 4 DUP7 SUB SHR %and_const(0xF) // first_nib_1 = (key_1 >> (bits_1 - 4)) & 0xF - DUP5 DUP5 %sub_const(4) SHR %and_const(0xF) + DUP5 PUSH 4 DUP6 SUB SHR %and_const(0xF) // stack: first_nib_1, first_nib_2, len_common, key_common, bits_1, key_1, bits_2, key_2 // if first_nib_1 != first_nib_2: break @@ -198,8 +204,8 @@ %pop2 %%return: // stack: len_common, key_common, bits_1, key_1, bits_2, key_2 - SWAP2 %div_const(4) SWAP2 // bits_1 -> len_1 (in nibbles) - SWAP4 %div_const(4) SWAP4 // bits_2 -> len_2 (in nibbles) + SWAP2 %shr_const(2) SWAP2 // bits_1 -> len_1 (in nibbles) + SWAP4 %shr_const(2) SWAP4 // bits_2 -> len_2 (in nibbles) // stack: len_common, key_common, len_1, key_1, len_2, key_2 %endmacro diff --git a/evm/src/cpu/kernel/asm/rlp/decode.asm b/evm/src/cpu/kernel/asm/rlp/decode.asm index 327edcf1c5..43c6627d6c 100644 --- a/evm/src/cpu/kernel/asm/rlp/decode.asm +++ b/evm/src/cpu/kernel/asm/rlp/decode.asm @@ -7,161 +7,141 @@ // assets. // Parse the length of a bytestring from RLP memory. The next len bytes after -// pos' will contain the string. +// rlp_addr' will contain the string. 
// -// Pre stack: pos, retdest -// Post stack: pos', len +// Pre stack: rlp_addr, retdest +// Post stack: rlp_addr', len global decode_rlp_string_len: - // stack: pos, retdest + // stack: rlp_addr, retdest DUP1 - %mload_kernel(@SEGMENT_RLP_RAW) - // stack: first_byte, pos, retdest + MLOAD_GENERAL + // stack: first_byte, rlp_addr, retdest DUP1 %gt_const(0xb7) - // stack: first_byte >= 0xb8, first_byte, pos, retdest + // stack: first_byte >= 0xb8, first_byte, rlp_addr, retdest %jumpi(decode_rlp_string_len_large) - // stack: first_byte, pos, retdest + // stack: first_byte, rlp_addr, retdest DUP1 %gt_const(0x7f) - // stack: first_byte >= 0x80, first_byte, pos, retdest + // stack: first_byte >= 0x80, first_byte, rlp_addr, retdest %jumpi(decode_rlp_string_len_medium) // String is a single byte in the range [0x00, 0x7f]. - %stack (first_byte, pos, retdest) -> (retdest, pos, 1) + %stack (first_byte, rlp_addr, retdest) -> (retdest, rlp_addr, 1) JUMP decode_rlp_string_len_medium: // String is 0-55 bytes long. First byte contains the len. - // stack: first_byte, pos, retdest + // stack: first_byte, rlp_addr, retdest %sub_const(0x80) - // stack: len, pos, retdest + // stack: len, rlp_addr, retdest SWAP1 %increment - // stack: pos', len, retdest - %stack (pos, len, retdest) -> (retdest, pos, len) + // stack: rlp_addr', len, retdest + %stack (rlp_addr, len, retdest) -> (retdest, rlp_addr, len) JUMP decode_rlp_string_len_large: // String is >55 bytes long. First byte contains the len of the len. - // stack: first_byte, pos, retdest + // stack: first_byte, rlp_addr, retdest %sub_const(0xb7) - // stack: len_of_len, pos, retdest + // stack: len_of_len, rlp_addr, retdest SWAP1 %increment - // stack: pos', len_of_len, retdest + // stack: rlp_addr', len_of_len, retdest %jump(decode_int_given_len) // Convenience macro to call decode_rlp_string_len and return where we left off. 
%macro decode_rlp_string_len - %stack (pos) -> (pos, %%after) + %stack (rlp_addr) -> (rlp_addr, %%after) %jump(decode_rlp_string_len) %%after: %endmacro // Parse a scalar from RLP memory. -// Pre stack: pos, retdest -// Post stack: pos', scalar +// Pre stack: rlp_addr, retdest +// Post stack: rlp_addr', scalar // // Scalars are variable-length, but this method assumes a max length of 32 // bytes, so that the result can be returned as a single word on the stack. // As per the spec, scalars must not have leading zeros. global decode_rlp_scalar: - // stack: pos, retdest + // stack: rlp_addr, retdest PUSH decode_int_given_len - // stack: decode_int_given_len, pos, retdest + // stack: decode_int_given_len, rlp_addr, retdest SWAP1 - // stack: pos, decode_int_given_len, retdest + // stack: rlp_addr, decode_int_given_len, retdest // decode_rlp_string_len will return to decode_int_given_len, at which point - // the stack will contain (pos', len, retdest), which are the proper args + // the stack will contain (rlp_addr', len, retdest), which are the proper args // to decode_int_given_len. %jump(decode_rlp_string_len) // Convenience macro to call decode_rlp_scalar and return where we left off. %macro decode_rlp_scalar - %stack (pos) -> (pos, %%after) + %stack (rlp_addr) -> (rlp_addr, %%after) %jump(decode_rlp_scalar) %%after: %endmacro // Parse the length of an RLP list from memory. -// Pre stack: pos, retdest -// Post stack: pos', len +// Pre stack: rlp_addr, retdest +// Post stack: rlp_addr', len global decode_rlp_list_len: - // stack: pos, retdest + // stack: rlp_addr, retdest DUP1 - %mload_kernel(@SEGMENT_RLP_RAW) - // stack: first_byte, pos, retdest + MLOAD_GENERAL + // stack: first_byte, rlp_addr, retdest SWAP1 - %increment // increment pos + %increment // increment rlp_addr SWAP1 - // stack: first_byte, pos', retdest + // stack: first_byte, rlp_addr', retdest // If first_byte is >= 0xf8, it's a > 55 byte list, and // first_byte - 0xf7 is the length of the length. 
DUP1 %gt_const(0xf7) // GT is native while GE is not, so compare to 0xf6 instead - // stack: first_byte >= 0xf7, first_byte, pos', retdest + // stack: first_byte >= 0xf7, first_byte, rlp_addr', retdest %jumpi(decode_rlp_list_len_big) // This is the "small list" case. // The list length is first_byte - 0xc0. - // stack: first_byte, pos', retdest + // stack: first_byte, rlp_addr', retdest %sub_const(0xc0) - // stack: len, pos', retdest - %stack (len, pos, retdest) -> (retdest, pos, len) + // stack: len, rlp_addr', retdest + %stack (len, rlp_addr, retdest) -> (retdest, rlp_addr, len) JUMP decode_rlp_list_len_big: // The length of the length is first_byte - 0xf7. - // stack: first_byte, pos', retdest + // stack: first_byte, rlp_addr', retdest %sub_const(0xf7) - // stack: len_of_len, pos', retdest + // stack: len_of_len, rlp_addr', retdest SWAP1 - // stack: pos', len_of_len, retdest + // stack: rlp_addr', len_of_len, retdest %jump(decode_int_given_len) // Convenience macro to call decode_rlp_list_len and return where we left off. %macro decode_rlp_list_len - %stack (pos) -> (pos, %%after) + %stack (rlp_addr) -> (rlp_addr, %%after) %jump(decode_rlp_list_len) %%after: %endmacro // Parse an integer of the given length. It is assumed that the integer will // fit in a single (256-bit) word on the stack. 
-// Pre stack: pos, len, retdest -// Post stack: pos', int +// Pre stack: rlp_addr, len, retdest +// Post stack: rlp_addr', int global decode_int_given_len: - %stack (pos, len, retdest) -> (pos, len, pos, retdest) + DUP2 ISZERO %jumpi(empty_int) + %stack (rlp_addr, len, retdest) -> (rlp_addr, len, rlp_addr, len, retdest) ADD - // stack: end_pos, pos, retdest - SWAP1 - // stack: pos, end_pos, retdest - PUSH 0 // initial accumulator state - // stack: acc, pos, end_pos, retdest - -decode_int_given_len_loop: - // stack: acc, pos, end_pos, retdest - DUP3 - DUP3 - EQ - // stack: pos == end_pos, acc, pos, end_pos, retdest - %jumpi(decode_int_given_len_finish) - // stack: acc, pos, end_pos, retdest - %shl_const(8) - // stack: acc << 8, pos, end_pos, retdest - DUP2 - // stack: pos, acc << 8, pos, end_pos, retdest - %mload_kernel(@SEGMENT_RLP_RAW) - // stack: byte, acc << 8, pos, end_pos, retdest - ADD - // stack: acc', pos, end_pos, retdest - // Increment pos. - SWAP1 - %increment - SWAP1 - // stack: acc', pos', end_pos, retdest - %jump(decode_int_given_len_loop) + %stack(rlp_addr_two, rlp_addr, len, retdest) -> (rlp_addr, len, rlp_addr_two, retdest) + MLOAD_32BYTES + // stack: int, rlp_addr', retdest + %stack(int, rlp_addr, retdest) -> (retdest, rlp_addr, int) + JUMP -decode_int_given_len_finish: - %stack (acc, pos, end_pos, retdest) -> (retdest, pos, acc) +empty_int: + // stack: rlp_addr, len, retdest + %stack(rlp_addr, len, retdest) -> (retdest, rlp_addr, 0) JUMP + diff --git a/evm/src/cpu/kernel/asm/rlp/encode.asm b/evm/src/cpu/kernel/asm/rlp/encode.asm index 71eeaa8a96..9f6813ab18 100644 --- a/evm/src/cpu/kernel/asm/rlp/encode.asm +++ b/evm/src/cpu/kernel/asm/rlp/encode.asm @@ -1,76 +1,65 @@ -// RLP-encode a fixed-length 160 bit (20 byte) string. Assumes string < 2^160. -// Pre stack: pos, string, retdest -// Post stack: pos -global encode_rlp_160: - PUSH 20 - %jump(encode_rlp_fixed) - -// Convenience macro to call encode_rlp_160 and return where we left off. 
+// Convenience macro to RLP-encode a fixed-length 160 bit (20 byte) string +// and return where we left off. Assumes string < 2^160. +// Pre stack: rlp_addr, string, retdest +// Post stack: rlp_addr %macro encode_rlp_160 - %stack (pos, string) -> (pos, string, %%after) - %jump(encode_rlp_160) + %stack (rlp_addr, string) -> (20, rlp_addr, string, %%after) + %jump(encode_rlp_fixed) %%after: %endmacro -// RLP-encode a fixed-length 256 bit (32 byte) string. -// Pre stack: pos, string, retdest -// Post stack: pos -global encode_rlp_256: - PUSH 32 - %jump(encode_rlp_fixed) - -// Convenience macro to call encode_rlp_256 and return where we left off. +// Convenience macro to RLP-encode a fixed-length 256 bit (32 byte) string +// and return where we left off. +// Pre stack: rlp_addr, string, retdest +// Post stack: rlp_addr %macro encode_rlp_256 - %stack (pos, string) -> (pos, string, %%after) - %jump(encode_rlp_256) + %stack (rlp_addr, string) -> (32, rlp_addr, string, %%after) + %jump(encode_rlp_fixed) %%after: %endmacro // RLP-encode a fixed-length string with the given byte length. Assumes string < 2^(8 * len). 
global encode_rlp_fixed: - // stack: len, pos, string, retdest - DUP1 + // stack: len, rlp_addr, string, retdest + DUP2 + DUP2 %add_const(0x80) - // stack: first_byte, len, pos, string, retdest - DUP3 - // stack: pos, first_byte, len, pos, string, retdest - %mstore_rlp - // stack: len, pos, string, retdest + // stack: first_byte, rlp_addr, len, rlp_addr, string, retdest + MSTORE_GENERAL + // stack: len, rlp_addr, string, retdest SWAP1 - %increment // increment pos - // stack: pos, len, string, retdest - %stack (pos, len, string) -> (pos, string, len, encode_rlp_fixed_finish) - // stack: pos, string, len, encode_rlp_fixed_finish, retdest - %jump(mstore_unpacking_rlp) + %increment // increment rlp_addr + // stack: rlp_addr, len, string, retdest + %stack (rlp_addr, len, string) -> (rlp_addr, string, len, encode_rlp_fixed_finish) + // stack: rlp_addr, string, len, encode_rlp_fixed_finish, retdest + %jump(mstore_unpacking) encode_rlp_fixed_finish: - // stack: pos', retdest + // stack: rlp_addr', retdest SWAP1 JUMP // Doubly-RLP-encode a fixed-length string with the given byte length. // I.e. writes encode(encode(string). Assumes string < 2^(8 * len). 
global doubly_encode_rlp_fixed: - // stack: len, pos, string, retdest - DUP1 + // stack: len, rlp_addr, string, retdest + DUP2 + DUP2 %add_const(0x81) - // stack: first_byte, len, pos, string, retdest - DUP3 - // stack: pos, first_byte, len, pos, string, retdest - %mstore_rlp - // stack: len, pos, string, retdest - DUP1 + // stack: first_byte, rlp_addr, len, rlp_addr, string, retdest + MSTORE_GENERAL + // stack: len, rlp_addr, string, retdest + DUP2 %increment + DUP2 %add_const(0x80) - // stack: second_byte, len, original_pos, string, retdest - DUP3 %increment - // stack: pos', second_byte, len, pos, string, retdest - %mstore_rlp - // stack: len, pos, string, retdest + // stack: second_byte, rlp_addr', len, original_rlp_addr, string, retdest + MSTORE_GENERAL + // stack: len, rlp_addr, string, retdest SWAP1 %add_const(2) // advance past the two prefix bytes - // stack: pos'', len, string, retdest - %stack (pos, len, string) -> (pos, string, len, encode_rlp_fixed_finish) - // stack: context, segment, pos'', string, len, encode_rlp_fixed_finish, retdest - %jump(mstore_unpacking_rlp) + // stack: rlp_addr'', len, string, retdest + %stack (rlp_addr, len, string) -> (rlp_addr, string, len, encode_rlp_fixed_finish) + // stack: context, segment, rlp_addr'', string, len, encode_rlp_fixed_finish, retdest + %jump(mstore_unpacking) // Writes the RLP prefix for a string of the given length. This does not handle // the trivial encoding of certain single-byte strings, as handling that would @@ -78,156 +67,156 @@ global doubly_encode_rlp_fixed: // length. This method should generally be used only when we know a string // contains at least two bytes. 
// -// Pre stack: pos, str_len, retdest -// Post stack: pos' +// Pre stack: rlp_addr, str_len, retdest +// Post stack: rlp_addr' global encode_rlp_multi_byte_string_prefix: - // stack: pos, str_len, retdest + // stack: rlp_addr, str_len, retdest DUP2 %gt_const(55) - // stack: str_len > 55, pos, str_len, retdest + // stack: str_len > 55, rlp_addr, str_len, retdest %jumpi(encode_rlp_multi_byte_string_prefix_large) // Medium case; prefix is 0x80 + str_len. - // stack: pos, str_len, retdest - SWAP1 %add_const(0x80) - // stack: prefix, pos, retdest + // stack: rlp_addr, str_len, retdest + PUSH 0x80 DUP2 - // stack: pos, prefix, pos, retdest - %mstore_rlp - // stack: pos, retdest + // stack: rlp_addr, 0x80, rlp_addr, str_len, retdest + SWAP3 ADD + // stack: prefix, rlp_addr, rlp_addr, retdest + MSTORE_GENERAL + // stack: rlp_addr, retdest %increment - // stack: pos', retdest + // stack: rlp_addr', retdest SWAP1 JUMP encode_rlp_multi_byte_string_prefix_large: // Large case; prefix is 0xb7 + len_of_len, followed by str_len. 
- // stack: pos, str_len, retdest + // stack: rlp_addr, str_len, retdest DUP2 %num_bytes - // stack: len_of_len, pos, str_len, retdest + // stack: len_of_len, rlp_addr, str_len, retdest SWAP1 - DUP2 // len_of_len + DUP1 // rlp_addr + DUP3 // len_of_len %add_const(0xb7) - // stack: first_byte, pos, len_of_len, str_len, retdest - DUP2 - // stack: pos, first_byte, pos, len_of_len, str_len, retdest - %mstore_rlp - // stack: pos, len_of_len, str_len, retdest + // stack: first_byte, rlp_addr, rlp_addr, len_of_len, str_len, retdest + MSTORE_GENERAL + // stack: rlp_addr, len_of_len, str_len, retdest %increment - // stack: pos', len_of_len, str_len, retdest - %stack (pos, len_of_len, str_len) -> (pos, str_len, len_of_len) - %jump(mstore_unpacking_rlp) + // stack: rlp_addr', len_of_len, str_len, retdest + %stack (rlp_addr, len_of_len, str_len) -> (rlp_addr, str_len, len_of_len) + %jump(mstore_unpacking) %macro encode_rlp_multi_byte_string_prefix - %stack (pos, str_len) -> (pos, str_len, %%after) + %stack (rlp_addr, str_len) -> (rlp_addr, str_len, %%after) %jump(encode_rlp_multi_byte_string_prefix) %%after: %endmacro // Writes the RLP prefix for a list with the given payload length. // -// Pre stack: pos, payload_len, retdest -// Post stack: pos' +// Pre stack: rlp_addr, payload_len, retdest +// Post stack: rlp_addr' global encode_rlp_list_prefix: - // stack: pos, payload_len, retdest + // stack: rlp_addr, payload_len, retdest DUP2 %gt_const(55) %jumpi(encode_rlp_list_prefix_large) // Small case: prefix is just 0xc0 + length. - // stack: pos, payload_len, retdest - SWAP1 + // stack: rlp_addr, payload_len, retdest + DUP1 + SWAP2 %add_const(0xc0) - // stack: prefix, pos, retdest - DUP2 - // stack: pos, prefix, pos, retdest - %mstore_rlp - // stack: pos, retdest + // stack: prefix, rlp_addr, rlp_addr, retdest + MSTORE_GENERAL + // stack: rlp_addr, retdest %increment SWAP1 JUMP encode_rlp_list_prefix_large: // Write 0xf7 + len_of_len. 
- // stack: pos, payload_len, retdest + // stack: rlp_addr, payload_len, retdest DUP2 %num_bytes - // stack: len_of_len, pos, payload_len, retdest - DUP1 %add_const(0xf7) - // stack: first_byte, len_of_len, pos, payload_len, retdest - DUP3 // pos - %mstore_rlp - // stack: len_of_len, pos, payload_len, retdest + // stack: len_of_len, rlp_addr, payload_len, retdest + DUP2 + DUP2 %add_const(0xf7) + // stack: first_byte, rlp_addr, len_of_len, rlp_addr, payload_len, retdest + MSTORE_GENERAL + // stack: len_of_len, rlp_addr, payload_len, retdest SWAP1 %increment - // stack: pos', len_of_len, payload_len, retdest - %stack (pos, len_of_len, payload_len) - -> (pos, payload_len, len_of_len, + // stack: rlp_addr', len_of_len, payload_len, retdest + %stack (rlp_addr, len_of_len, payload_len) + -> (rlp_addr, payload_len, len_of_len, encode_rlp_list_prefix_large_done_writing_len) - %jump(mstore_unpacking_rlp) + %jump(mstore_unpacking) encode_rlp_list_prefix_large_done_writing_len: - // stack: pos'', retdest + // stack: rlp_addr'', retdest SWAP1 JUMP %macro encode_rlp_list_prefix - %stack (pos, payload_len) -> (pos, payload_len, %%after) + %stack (rlp_addr, payload_len) -> (rlp_addr, payload_len, %%after) %jump(encode_rlp_list_prefix) %%after: %endmacro -// Given an RLP list payload which starts and ends at the given positions, -// prepend the appropriate RLP list prefix. Returns the updated start position, +// Given an RLP list payload which starts and ends at the given rlp_address, +// prepend the appropriate RLP list prefix. Returns the updated start rlp_address, // as well as the length of the RLP data (including the newly-added prefix). 
// -// Pre stack: end_pos, start_pos, retdest -// Post stack: prefix_start_pos, rlp_len +// Pre stack: end_rlp_addr, start_rlp_addr, retdest +// Post stack: prefix_start_rlp_addr, rlp_len global prepend_rlp_list_prefix: - // stack: end_pos, start_pos, retdest - DUP2 DUP2 SUB // end_pos - start_pos - // stack: payload_len, end_pos, start_pos, retdest + // stack: end_rlp_addr, start_rlp_addr, retdest + DUP2 DUP2 SUB // end_rlp_addr - start_rlp_addr + // stack: payload_len, end_rlp_addr, start_rlp_addr, retdest DUP1 %gt_const(55) %jumpi(prepend_rlp_list_prefix_big) - // If we got here, we have a small list, so we prepend 0xc0 + len at position 8. - // stack: payload_len, end_pos, start_pos, retdest - DUP1 %add_const(0xc0) - // stack: prefix_byte, payload_len, end_pos, start_pos, retdest - DUP4 %decrement // offset of prefix - %mstore_rlp - // stack: payload_len, end_pos, start_pos, retdest + // If we got here, we have a small list, so we prepend 0xc0 + len at rlp_address 8. + // stack: payload_len, end_rlp_addr, start_rlp_addr, retdest + PUSH 1 DUP4 SUB // offset of prefix + DUP2 %add_const(0xc0) + // stack: prefix_byte, start_rlp_addr-1, payload_len, end_rlp_addr, start_rlp_addr, retdest + MSTORE_GENERAL + // stack: payload_len, end_rlp_addr, start_rlp_addr, retdest %increment - // stack: rlp_len, end_pos, start_pos, retdest + // stack: rlp_len, end_rlp_addr, start_rlp_addr, retdest SWAP2 %decrement - // stack: prefix_start_pos, end_pos, rlp_len, retdest - %stack (prefix_start_pos, end_pos, rlp_len, retdest) -> (retdest, prefix_start_pos, rlp_len) + // stack: prefix_start_rlp_addr, end_rlp_addr, rlp_len, retdest + %stack (prefix_start_rlp_addr, end_rlp_addr, rlp_len, retdest) -> (retdest, prefix_start_rlp_addr, rlp_len) JUMP prepend_rlp_list_prefix_big: - // We have a large list, so we prepend 0xf7 + len_of_len at position - // prefix_start_pos = start_pos - 1 - len_of_len + // We have a large list, so we prepend 0xf7 + len_of_len at rlp_address + // 
prefix_start_rlp_addr = start_rlp_addr - 1 - len_of_len // followed by the length itself. - // stack: payload_len, end_pos, start_pos, retdest + // stack: payload_len, end_rlp_addr, start_rlp_addr, retdest DUP1 %num_bytes - // stack: len_of_len, payload_len, end_pos, start_pos, retdest + // stack: len_of_len, payload_len, end_rlp_addr, start_rlp_addr, retdest DUP1 - DUP5 %decrement // start_pos - 1 + PUSH 1 DUP6 SUB // start_rlp_addr - 1 SUB - // stack: prefix_start_pos, len_of_len, payload_len, end_pos, start_pos, retdest - DUP2 %add_const(0xf7) DUP2 %mstore_rlp // rlp[prefix_start_pos] = 0xf7 + len_of_len - // stack: prefix_start_pos, len_of_len, payload_len, end_pos, start_pos, retdest - DUP1 %increment // start_len_pos = prefix_start_pos + 1 - %stack (start_len_pos, prefix_start_pos, len_of_len, payload_len, end_pos, start_pos, retdest) - -> (start_len_pos, payload_len, len_of_len, + // stack: prefix_start_rlp_addr, len_of_len, payload_len, end_rlp_addr, start_rlp_addr, retdest + DUP1 + DUP3 %add_const(0xf7) MSTORE_GENERAL // rlp[prefix_start_rlp_addr] = 0xf7 + len_of_len + // stack: prefix_start_rlp_addr, len_of_len, payload_len, end_rlp_addr, start_rlp_addr, retdest + DUP1 %increment // start_len_rlp_addr = prefix_start_rlp_addr + 1 + %stack (start_len_rlp_addr, prefix_start_rlp_addr, len_of_len, payload_len, end_rlp_addr, start_rlp_addr, retdest) + -> (start_len_rlp_addr, payload_len, len_of_len, prepend_rlp_list_prefix_big_done_writing_len, - prefix_start_pos, end_pos, retdest) - %jump(mstore_unpacking_rlp) + prefix_start_rlp_addr, end_rlp_addr, retdest) + %jump(mstore_unpacking) prepend_rlp_list_prefix_big_done_writing_len: - // stack: start_pos, prefix_start_pos, end_pos, retdest - %stack (start_pos, prefix_start_pos, end_pos) - -> (end_pos, prefix_start_pos, prefix_start_pos) - // stack: end_pos, prefix_start_pos, prefix_start_pos, retdest + // stack: start_rlp_addr, prefix_start_rlp_addr, end_rlp_addr, retdest + %stack (start_rlp_addr, 
prefix_start_rlp_addr, end_rlp_addr) + -> (end_rlp_addr, prefix_start_rlp_addr, prefix_start_rlp_addr) + // stack: end_rlp_addr, prefix_start_rlp_addr, prefix_start_rlp_addr, retdest SUB - // stack: rlp_len, prefix_start_pos, retdest - %stack (rlp_len, prefix_start_pos, retdest) -> (retdest, prefix_start_pos, rlp_len) + // stack: rlp_len, prefix_start_rlp_addr, retdest + %stack (rlp_len, prefix_start_rlp_addr, retdest) -> (retdest, prefix_start_rlp_addr, rlp_len) JUMP // Convenience macro to call prepend_rlp_list_prefix and return where we left off. %macro prepend_rlp_list_prefix - %stack (end_pos, start_pos) -> (end_pos, start_pos, %%after) + %stack (end_rlp_addr, start_rlp_addr) -> (end_rlp_addr, start_rlp_addr, %%after) %jump(prepend_rlp_list_prefix) %%after: %endmacro @@ -274,12 +263,3 @@ prepend_rlp_list_prefix_big_done_writing_len: ADD %%finish: %endmacro - -// Like mstore_unpacking, but specifically for the RLP segment. -// Pre stack: offset, value, len, retdest -// Post stack: offset' -global mstore_unpacking_rlp: - // stack: offset, value, len, retdest - PUSH @SEGMENT_RLP_RAW - PUSH 0 // context - %jump(mstore_unpacking) diff --git a/evm/src/cpu/kernel/asm/rlp/encode_rlp_scalar.asm b/evm/src/cpu/kernel/asm/rlp/encode_rlp_scalar.asm index cd4a837e31..d311a57ebc 100644 --- a/evm/src/cpu/kernel/asm/rlp/encode_rlp_scalar.asm +++ b/evm/src/cpu/kernel/asm/rlp/encode_rlp_scalar.asm @@ -1,8 +1,8 @@ // RLP-encode a scalar, i.e. a variable-length integer. -// Pre stack: pos, scalar, retdest -// Post stack: pos +// Pre stack: rlp_addr, scalar, retdest +// Post stack: rlp_addr global encode_rlp_scalar: - // stack: pos, scalar, retdest + // stack: rlp_addr, scalar, retdest // If scalar > 0x7f, this is the "medium" case. DUP2 %gt_const(0x7f) @@ -12,12 +12,12 @@ global encode_rlp_scalar: DUP2 %jumpi(encode_rlp_scalar_small) // scalar = 0, so BE(scalar) is the empty string, which RLP encodes as a single byte 0x80. 
- // stack: pos, scalar, retdest - %stack (pos, scalar) -> (pos, 0x80, pos) - %mstore_rlp - // stack: pos, retdest + // stack: rlp_addr, scalar, retdest + %stack (rlp_addr, scalar) -> (0x80, rlp_addr, rlp_addr) + MSTORE_GENERAL + // stack: rlp_addr, retdest %increment - // stack: pos', retdest + // stack: rlp_addr', retdest SWAP1 JUMP @@ -26,17 +26,17 @@ encode_rlp_scalar_medium: // (big-endian) scalar bytes. We first compute the minimal number of bytes // needed to represent this scalar, then treat it as if it was a fixed- // length string with that length. - // stack: pos, scalar, retdest + // stack: rlp_addr, scalar, retdest DUP2 %num_bytes - // stack: scalar_bytes, pos, scalar, retdest + // stack: scalar_bytes, rlp_addr, scalar, retdest %jump(encode_rlp_fixed) // Doubly-RLP-encode a scalar, i.e. return encode(encode(scalar)). -// Pre stack: pos, scalar, retdest -// Post stack: pos +// Pre stack: rlp_addr, scalar, retdest +// Post stack: rlp_addr global doubly_encode_rlp_scalar: - // stack: pos, scalar, retdest + // stack: rlp_addr, scalar, retdest // If scalar > 0x7f, this is the "medium" case. DUP2 %gt_const(0x7f) @@ -46,15 +46,16 @@ global doubly_encode_rlp_scalar: DUP2 %jumpi(encode_rlp_scalar_small) // scalar = 0, so BE(scalar) is the empty string, encode(scalar) = 0x80, and encode(encode(scalar)) = 0x8180. 
- // stack: pos, scalar, retdest - %stack (pos, scalar) -> (pos, 0x81, pos, 0x80, pos) - %mstore_rlp - // stack: pos, 0x80, pos, retdest + // stack: rlp_addr, scalar, retdest + %stack (rlp_addr, scalar) -> (0x81, rlp_addr, rlp_addr) + MSTORE_GENERAL + // stack: rlp_addr, retdest %increment - %mstore_rlp - // stack: pos, retdest - %add_const(2) - // stack: pos, retdest + DUP1 PUSH 0x80 + MSTORE_GENERAL + // stack: rlp_addr, retdest + %increment + // stack: rlp_addr, retdest SWAP1 JUMP @@ -65,35 +66,43 @@ doubly_encode_rlp_scalar_medium: // encode(encode(scalar)) = [0x80 + len + 1] || [0x80 + len] || BE(scalar) // We first compute the length of the scalar with %num_bytes, then treat the scalar as if it was a // fixed-length string with that length. - // stack: pos, scalar, retdest + // stack: rlp_addr, scalar, retdest DUP2 %num_bytes - // stack: scalar_bytes, pos, scalar, retdest + // stack: scalar_bytes, rlp_addr, scalar, retdest %jump(doubly_encode_rlp_fixed) // The "small" case of RLP-encoding a scalar, where the value is its own encoding. // This can be used for both for singly encoding or doubly encoding, since encode(encode(x)) = encode(x) = x. encode_rlp_scalar_small: - // stack: pos, scalar, retdest - %stack (pos, scalar) -> (pos, scalar, pos) - // stack: pos, scalar, pos, retdest - %mstore_rlp - // stack: pos, retdest + // stack: rlp_addr, scalar, retdest + %stack (rlp_addr, scalar) -> (scalar, rlp_addr, rlp_addr) + // stack: scalar, rlp_addr, rlp_addr, retdest + MSTORE_GENERAL + // stack: rlp_addr, retdest %increment - // stack: pos', retdest + // stack: rlp_addr', retdest SWAP1 JUMP +// Convenience macro to call encode_rlp_scalar and return where we left off. +// It takes swapped inputs, i.e. `scalar, rlp_addr` instead of `rlp_addr, scalar`. 
+%macro encode_rlp_scalar_swapped_inputs + %stack (scalar, rlp_addr) -> (rlp_addr, scalar, %%after) + %jump(encode_rlp_scalar) +%%after: +%endmacro + // Convenience macro to call encode_rlp_scalar and return where we left off. %macro encode_rlp_scalar - %stack (pos, scalar) -> (pos, scalar, %%after) + %stack (rlp_addr, scalar) -> (rlp_addr, scalar, %%after) %jump(encode_rlp_scalar) %%after: %endmacro // Convenience macro to call doubly_encode_rlp_scalar and return where we left off. %macro doubly_encode_rlp_scalar - %stack (pos, scalar) -> (pos, scalar, %%after) + %stack (rlp_addr, scalar) -> (rlp_addr, scalar, %%after) %jump(doubly_encode_rlp_scalar) %%after: %endmacro diff --git a/evm/src/cpu/kernel/asm/rlp/encode_rlp_string.asm b/evm/src/cpu/kernel/asm/rlp/encode_rlp_string.asm index 1065c61209..60174a9436 100644 --- a/evm/src/cpu/kernel/asm/rlp/encode_rlp_string.asm +++ b/evm/src/cpu/kernel/asm/rlp/encode_rlp_string.asm @@ -1,80 +1,79 @@ // Encodes an arbitrary string, given a pointer and length. 
-// Pre stack: pos, ADDR: 3, len, retdest -// Post stack: pos' +// Pre stack: rlp_addr, ADDR, len, retdest +// Post stack: rlp_addr' global encode_rlp_string: - // stack: pos, ADDR: 3, len, retdest - DUP5 %eq_const(1) - // stack: len == 1, pos, ADDR: 3, len, retdest - DUP5 DUP5 DUP5 // ADDR: 3 + // stack: rlp_addr, ADDR, len, retdest + DUP3 %eq_const(1) + // stack: len == 1, rlp_addr, ADDR, len, retdest + DUP3 MLOAD_GENERAL - // stack: first_byte, len == 1, pos, ADDR: 3, len, retdest + // stack: first_byte, len == 1, rlp_addr, ADDR, len, retdest %lt_const(128) MUL // cheaper than AND - // stack: single_small_byte, pos, ADDR: 3, len, retdest + // stack: single_small_byte, rlp_addr, ADDR, len, retdest %jumpi(encode_rlp_string_small_single_byte) - // stack: pos, ADDR: 3, len, retdest - DUP5 %gt_const(55) - // stack: len > 55, pos, ADDR: 3, len, retdest + // stack: rlp_addr, ADDR, len, retdest + DUP3 %gt_const(55) + // stack: len > 55, rlp_addr, ADDR, len, retdest %jumpi(encode_rlp_string_large) global encode_rlp_string_small: - // stack: pos, ADDR: 3, len, retdest - DUP5 // len + // stack: rlp_addr, ADDR, len, retdest + DUP1 + DUP4 // len %add_const(0x80) - // stack: first_byte, pos, ADDR: 3, len, retdest - DUP2 - // stack: pos, first_byte, pos, ADDR: 3, len, retdest - %mstore_rlp - // stack: pos, ADDR: 3, len, retdest + // stack: first_byte, rlp_addr, rlp_addr, ADDR, len, retdest + MSTORE_GENERAL + // stack: rlp_addr, ADDR, len, retdest %increment - // stack: pos', ADDR: 3, len, retdest - DUP5 DUP2 ADD // pos'' = pos' + len - // stack: pos'', pos', ADDR: 3, len, retdest - %stack (pos2, pos1, ADDR: 3, len, retdest) - -> (0, @SEGMENT_RLP_RAW, pos1, ADDR, len, retdest, pos2) - %jump(memcpy) + // stack: rlp_addr', ADDR, len, retdest + DUP3 DUP2 ADD // rlp_addr'' = rlp_addr' + len + // stack: rlp_addr'', rlp_addr', ADDR, len, retdest + %stack (rlp_addr2, rlp_addr1, ADDR, len, retdest) + -> (rlp_addr1, ADDR, len, retdest, rlp_addr2) + %jump(memcpy_bytes) global 
encode_rlp_string_small_single_byte: - // stack: pos, ADDR: 3, len, retdest - %stack (pos, ADDR: 3, len) -> (ADDR, pos) + // stack: rlp_addr, ADDR, len, retdest + %stack (rlp_addr, ADDR, len) -> (ADDR, rlp_addr) MLOAD_GENERAL - // stack: byte, pos, retdest - DUP2 - %mstore_rlp - // stack: pos, retdest + // stack: byte, rlp_addr, retdest + DUP2 SWAP1 + MSTORE_GENERAL + // stack: rlp_addr, retdest %increment SWAP1 - // stack: retdest, pos' + // stack: retdest, rlp_addr' JUMP global encode_rlp_string_large: - // stack: pos, ADDR: 3, len, retdest - DUP5 %num_bytes - // stack: len_of_len, pos, ADDR: 3, len, retdest + // stack: rlp_addr, ADDR, len, retdest + DUP3 %num_bytes + // stack: len_of_len, rlp_addr, ADDR, len, retdest SWAP1 - DUP2 // len_of_len + DUP1 + // stack: rlp_addr, rlp_addr, len_of_len, ADDR, len, retdest + DUP3 // len_of_len %add_const(0xb7) - // stack: first_byte, pos, len_of_len, ADDR: 3, len, retdest - DUP2 - // stack: pos, first_byte, pos, len_of_len, ADDR: 3, len, retdest - %mstore_rlp - // stack: pos, len_of_len, ADDR: 3, len, retdest + // stack: first_byte, rlp_addr, rlp_addr, len_of_len, ADDR, len, retdest + MSTORE_GENERAL + // stack: rlp_addr, len_of_len, ADDR, len, retdest %increment - // stack: pos', len_of_len, ADDR: 3, len, retdest - %stack (pos, len_of_len, ADDR: 3, len) - -> (pos, len, len_of_len, encode_rlp_string_large_after_writing_len, ADDR, len) - %jump(mstore_unpacking_rlp) + // stack: rlp_addr', len_of_len, ADDR, len, retdest + %stack (rlp_addr, len_of_len, ADDR, len) + -> (rlp_addr, len, len_of_len, encode_rlp_string_large_after_writing_len, ADDR, len) + %jump(mstore_unpacking) global encode_rlp_string_large_after_writing_len: - // stack: pos'', ADDR: 3, len, retdest - DUP5 DUP2 ADD // pos''' = pos'' + len - // stack: pos''', pos'', ADDR: 3, len, retdest - %stack (pos3, pos2, ADDR: 3, len, retdest) - -> (0, @SEGMENT_RLP_RAW, pos2, ADDR, len, retdest, pos3) - %jump(memcpy) + // stack: rlp_addr'', ADDR, len, retdest + DUP3 DUP2 ADD 
// rlp_addr''' = rlp_addr'' + len + // stack: rlp_addr''', rlp_addr'', ADDR, len, retdest + %stack (rlp_addr3, rlp_addr2, ADDR, len, retdest) + -> (rlp_addr2, ADDR, len, retdest, rlp_addr3) + %jump(memcpy_bytes) %macro encode_rlp_string - %stack (pos, ADDR: 3, len) -> (pos, ADDR, len, %%after) + %stack (rlp_addr, ADDR, len) -> (rlp_addr, ADDR, len, %%after) %jump(encode_rlp_string) %%after: %endmacro diff --git a/evm/src/cpu/kernel/asm/rlp/increment_bounded_rlp.asm b/evm/src/cpu/kernel/asm/rlp/increment_bounded_rlp.asm index 2e76c20f8f..6958cff9f8 100644 --- a/evm/src/cpu/kernel/asm/rlp/increment_bounded_rlp.asm +++ b/evm/src/cpu/kernel/asm/rlp/increment_bounded_rlp.asm @@ -2,8 +2,8 @@ // its number of nibbles when required. Shouldn't be // called with rlp_index > 0x82 ff ff global increment_bounded_rlp: - // stack: rlp_index, num_nibbles, retdest - DUP1 + // stack: num_nibbles, rlp_index, retdest + DUP2 %eq_const(0x80) %jumpi(case_0x80) DUP1 @@ -14,19 +14,19 @@ global increment_bounded_rlp: %jumpi(case_0x81ff) // If rlp_index != 0x80 and rlp_index != 0x7f and rlp_index != 0x81ff // we only need to add one and keep the number of nibbles - %increment - %stack (rlp_index, num_nibbles, retdest) -> (retdest, rlp_index, num_nibbles) + DUP2 %increment DUP2 + %stack (next_num_nibbles, next_rlp_index, num_nibbles, rlp_index, retdest) -> (retdest, rlp_index, num_nibbles, next_rlp_index, next_num_nibbles) JUMP case_0x80: - %stack (rlp_index, num_nibbles, retdest) -> (retdest, 0x01, 2) + %stack (num_nibbles, rlp_index, retdest) -> (retdest, 0x80, 2, 0x01, 2) JUMP case_0x7f: - %stack (rlp_index, num_nibbles, retdest) -> (retdest, 0x8180, 4) + %stack (num_nibbles, rlp_index, retdest) -> (retdest, 0x7f, 2, 0x8180, 4) JUMP case_0x81ff: - %stack (rlp_index, num_nibbles, retdest) -> (retdest, 0x820100, 6) + %stack (num_nibbles, rlp_index, retdest) -> (retdest, 0x81ff, 4, 0x820100, 6) JUMP diff --git a/evm/src/cpu/kernel/asm/rlp/num_bytes.asm 
b/evm/src/cpu/kernel/asm/rlp/num_bytes.asm index a242f14784..de0a7ca966 100644 --- a/evm/src/cpu/kernel/asm/rlp/num_bytes.asm +++ b/evm/src/cpu/kernel/asm/rlp/num_bytes.asm @@ -1,78 +1,26 @@ // Get the number of bytes required to represent the given scalar. // Note that we define num_bytes(0) to be 1. - global num_bytes: // stack: x, retdest - DUP1 PUSH 0 BYTE %jumpi(return_32) - DUP1 PUSH 1 BYTE %jumpi(return_31) - DUP1 PUSH 2 BYTE %jumpi(return_30) - DUP1 PUSH 3 BYTE %jumpi(return_29) - DUP1 PUSH 4 BYTE %jumpi(return_28) - DUP1 PUSH 5 BYTE %jumpi(return_27) - DUP1 PUSH 6 BYTE %jumpi(return_26) - DUP1 PUSH 7 BYTE %jumpi(return_25) - DUP1 PUSH 8 BYTE %jumpi(return_24) - DUP1 PUSH 9 BYTE %jumpi(return_23) - DUP1 PUSH 10 BYTE %jumpi(return_22) - DUP1 PUSH 11 BYTE %jumpi(return_21) - DUP1 PUSH 12 BYTE %jumpi(return_20) - DUP1 PUSH 13 BYTE %jumpi(return_19) - DUP1 PUSH 14 BYTE %jumpi(return_18) - DUP1 PUSH 15 BYTE %jumpi(return_17) - DUP1 PUSH 16 BYTE %jumpi(return_16) - DUP1 PUSH 17 BYTE %jumpi(return_15) - DUP1 PUSH 18 BYTE %jumpi(return_14) - DUP1 PUSH 19 BYTE %jumpi(return_13) - DUP1 PUSH 20 BYTE %jumpi(return_12) - DUP1 PUSH 21 BYTE %jumpi(return_11) - DUP1 PUSH 22 BYTE %jumpi(return_10) - DUP1 PUSH 23 BYTE %jumpi(return_9) - DUP1 PUSH 24 BYTE %jumpi(return_8) - DUP1 PUSH 25 BYTE %jumpi(return_7) - DUP1 PUSH 26 BYTE %jumpi(return_6) - DUP1 PUSH 27 BYTE %jumpi(return_5) - DUP1 PUSH 28 BYTE %jumpi(return_4) - DUP1 PUSH 29 BYTE %jumpi(return_3) - PUSH 30 BYTE %jumpi(return_2) + DUP1 ISZERO %jumpi(return_1) + // Non-deterministically guess the number of bits + PROVER_INPUT(num_bits) + %stack(num_bits, x) -> (num_bits, 1, x, num_bits) + SUB + SHR + // stack: 1, num_bits + %assert_eq_const(1) + // convert number of bits to number of bytes + %add_const(7) + %shr_const(3) - // If we got all the way here, each byte was zero, except possibly the least - // significant byte, which we didn't check. Either way, the result is 1. 
- // stack: retdest - PUSH 1 SWAP1 JUMP -return_2: PUSH 2 SWAP1 JUMP -return_3: POP PUSH 3 SWAP1 JUMP -return_4: POP PUSH 4 SWAP1 JUMP -return_5: POP PUSH 5 SWAP1 JUMP -return_6: POP PUSH 6 SWAP1 JUMP -return_7: POP PUSH 7 SWAP1 JUMP -return_8: POP PUSH 8 SWAP1 JUMP -return_9: POP PUSH 9 SWAP1 JUMP -return_10: POP PUSH 10 SWAP1 JUMP -return_11: POP PUSH 11 SWAP1 JUMP -return_12: POP PUSH 12 SWAP1 JUMP -return_13: POP PUSH 13 SWAP1 JUMP -return_14: POP PUSH 14 SWAP1 JUMP -return_15: POP PUSH 15 SWAP1 JUMP -return_16: POP PUSH 16 SWAP1 JUMP -return_17: POP PUSH 17 SWAP1 JUMP -return_18: POP PUSH 18 SWAP1 JUMP -return_19: POP PUSH 19 SWAP1 JUMP -return_20: POP PUSH 20 SWAP1 JUMP -return_21: POP PUSH 21 SWAP1 JUMP -return_22: POP PUSH 22 SWAP1 JUMP -return_23: POP PUSH 23 SWAP1 JUMP -return_24: POP PUSH 24 SWAP1 JUMP -return_25: POP PUSH 25 SWAP1 JUMP -return_26: POP PUSH 26 SWAP1 JUMP -return_27: POP PUSH 27 SWAP1 JUMP -return_28: POP PUSH 28 SWAP1 JUMP -return_29: POP PUSH 29 SWAP1 JUMP -return_30: POP PUSH 30 SWAP1 JUMP -return_31: POP PUSH 31 SWAP1 JUMP -return_32: POP PUSH 32 SWAP1 JUMP +return_1: + // stack: x, retdest + %stack(x, retdest) -> (retdest, 1) + JUMP // Convenience macro to call num_bytes and return where we left off. %macro num_bytes diff --git a/evm/src/cpu/kernel/asm/rlp/read_to_memory.asm b/evm/src/cpu/kernel/asm/rlp/read_to_memory.asm index 85a7817522..8070fd0beb 100644 --- a/evm/src/cpu/kernel/asm/rlp/read_to_memory.asm +++ b/evm/src/cpu/kernel/asm/rlp/read_to_memory.asm @@ -2,35 +2,37 @@ // segment of memory. // Pre stack: retdest -// Post stack: (empty) +// Post stack: txn_rlp_len global read_rlp_to_memory: // stack: retdest PROVER_INPUT(rlp) // Read the RLP blob length from the prover tape. 
// stack: len, retdest - PUSH 0 // initial position - // stack: pos, len, retdest + PUSH @SEGMENT_RLP_RAW + %build_kernel_address + PUSH @SEGMENT_RLP_RAW // ctx == virt == 0 + // stack: addr, final_addr, retdest read_rlp_to_memory_loop: - // stack: pos, len, retdest + // stack: addr, final_addr, retdest DUP2 DUP2 - EQ - // stack: pos == len, pos, len, retdest + LT + ISZERO + // stack: addr >= final_addr, addr, final_addr, retdest %jumpi(read_rlp_to_memory_finish) - // stack: pos, len, retdest + // stack: addr, final_addr, retdest PROVER_INPUT(rlp) - // stack: byte, pos, len, retdest - DUP2 - // stack: pos, byte, pos, len, retdest - %mstore_kernel(@SEGMENT_RLP_RAW) - // stack: pos, len, retdest - %increment - // stack: pos', len, retdest + SWAP1 + MSTORE_32BYTES_32 + // stack: addr', final_addr, retdest %jump(read_rlp_to_memory_loop) read_rlp_to_memory_finish: - // stack: pos, len, retdest - POP - // stack: len, retdest - SWAP1 JUMP + // stack: addr, final_addr, retdest + // we recover the offset here + PUSH @SEGMENT_RLP_RAW // ctx == virt == 0 + DUP3 SUB + // stack: pos, addr, final_addr, retdest + %stack(pos, addr, final_addr, retdest) -> (retdest, pos) + JUMP \ No newline at end of file diff --git a/evm/src/cpu/kernel/asm/shift.asm b/evm/src/cpu/kernel/asm/shift.asm index ce481ea2a1..ee9ccbfaea 100644 --- a/evm/src/cpu/kernel/asm/shift.asm +++ b/evm/src/cpu/kernel/asm/shift.asm @@ -2,22 +2,17 @@ /// /// Specifically, set SHIFT_TABLE_SEGMENT[i] = 2^i for i = 0..255. 
%macro shift_table_init + push @SEGMENT_SHIFT_TABLE // segment, ctx == virt == 0 push 1 // 2^0 - push 0 // initial offset is zero - push @SEGMENT_SHIFT_TABLE // segment - dup2 // kernel context is 0 %rep 255 - // stack: context, segment, ost_i, 2^i - dup4 + // stack: 2^i, addr_i + dup2 + %increment + // stack: addr_(i+1), 2^i, addr_i + dup2 dup1 add - // stack: 2^(i+1), context, segment, ost_i, 2^i - dup4 - %increment - // stack: ost_(i+1), 2^(i+1), context, segment, ost_i, 2^i - dup4 - dup4 - // stack: context, segment, ost_(i+1), 2^(i+1), context, segment, ost_i, 2^i + // stack: 2^(i+1), addr_(i+1), 2^i, addr_i %endrep %rep 256 mstore_general diff --git a/evm/src/cpu/kernel/asm/transactions/common_decoding.asm b/evm/src/cpu/kernel/asm/transactions/common_decoding.asm index 9b12d9c931..4a8feccaa3 100644 --- a/evm/src/cpu/kernel/asm/transactions/common_decoding.asm +++ b/evm/src/cpu/kernel/asm/transactions/common_decoding.asm @@ -6,207 +6,206 @@ // Decode the chain ID and store it. %macro decode_and_store_chain_id - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, chain_id) -> (chain_id, pos) + %stack (rlp_addr, chain_id) -> (chain_id, rlp_addr) %mstore_txn_field(@TXN_FIELD_CHAIN_ID) - // stack: pos + // stack: rlp_addr %endmacro // Decode the nonce and store it. %macro decode_and_store_nonce - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, nonce) -> (nonce, pos) + %stack (rlp_addr, nonce) -> (nonce, rlp_addr) %mstore_txn_field(@TXN_FIELD_NONCE) - // stack: pos + // stack: rlp_addr %endmacro // Decode the gas price and, since this is for legacy txns, store it as both // TXN_FIELD_MAX_PRIORITY_FEE_PER_GAS and TXN_FIELD_MAX_FEE_PER_GAS. 
%macro decode_and_store_gas_price_legacy - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, gas_price) -> (gas_price, gas_price, pos) + %stack (rlp_addr, gas_price) -> (gas_price, gas_price, rlp_addr) %mstore_txn_field(@TXN_FIELD_MAX_PRIORITY_FEE_PER_GAS) %mstore_txn_field(@TXN_FIELD_MAX_FEE_PER_GAS) - // stack: pos + // stack: rlp_addr %endmacro // Decode the max priority fee and store it. %macro decode_and_store_max_priority_fee - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, gas_price) -> (gas_price, pos) + %stack (rlp_addr, gas_price) -> (gas_price, rlp_addr) %mstore_txn_field(@TXN_FIELD_MAX_PRIORITY_FEE_PER_GAS) - // stack: pos + // stack: rlp_addr %endmacro // Decode the max fee and store it. %macro decode_and_store_max_fee - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, gas_price) -> (gas_price, pos) + %stack (rlp_addr, gas_price) -> (gas_price, rlp_addr) %mstore_txn_field(@TXN_FIELD_MAX_FEE_PER_GAS) - // stack: pos + // stack: rlp_addr %endmacro // Decode the gas limit and store it. %macro decode_and_store_gas_limit - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, gas_limit) -> (gas_limit, pos) + %stack (rlp_addr, gas_limit) -> (gas_limit, rlp_addr) %mstore_txn_field(@TXN_FIELD_GAS_LIMIT) - // stack: pos + // stack: rlp_addr %endmacro // Decode the "to" field and store it. // This field is either 160-bit or empty in the case of a contract creation txn. 
%macro decode_and_store_to - // stack: pos + // stack: rlp_addr %decode_rlp_string_len - // stack: pos, len + // stack: rlp_addr, len SWAP1 - // stack: len, pos + // stack: len, rlp_addr DUP1 ISZERO %jumpi(%%contract_creation) - // stack: len, pos + // stack: len, rlp_addr DUP1 %eq_const(20) ISZERO %jumpi(invalid_txn) // Address is 160-bit - %stack (len, pos) -> (pos, len, %%with_scalar) + %stack (len, rlp_addr) -> (rlp_addr, len, %%with_scalar) %jump(decode_int_given_len) %%with_scalar: - // stack: pos, int + // stack: rlp_addr, int SWAP1 %mstore_txn_field(@TXN_FIELD_TO) - // stack: pos + // stack: rlp_addr %jump(%%end) %%contract_creation: - // stack: len, pos + // stack: len, rlp_addr POP PUSH 1 %mstore_global_metadata(@GLOBAL_METADATA_CONTRACT_CREATION) - // stack: pos + // stack: rlp_addr %%end: %endmacro // Decode the "value" field and store it. %macro decode_and_store_value - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, value) -> (value, pos) + %stack (rlp_addr, value) -> (value, rlp_addr) %mstore_txn_field(@TXN_FIELD_VALUE) - // stack: pos + // stack: rlp_addr %endmacro // Decode the calldata field, store its length in @TXN_FIELD_DATA_LEN, and copy it to @SEGMENT_TXN_DATA. %macro decode_and_store_data - // stack: pos - // Decode the data length, store it, and compute new_pos after any data. + // stack: rlp_addr + // Decode the data length, store it, and compute new_rlp_addr after any data. %decode_rlp_string_len - %stack (pos, data_len) -> (data_len, pos, data_len, pos, data_len) + %stack (rlp_addr, data_len) -> (data_len, rlp_addr, data_len, rlp_addr, data_len) %mstore_txn_field(@TXN_FIELD_DATA_LEN) - // stack: pos, data_len, pos, data_len + // stack: rlp_addr, data_len, rlp_addr, data_len ADD - // stack: new_pos, old_pos, data_len + // stack: new_rlp_addr, old_rlp_addr, data_len // Memcpy the txn data from @SEGMENT_RLP_RAW to @SEGMENT_TXN_DATA. 
- %stack (new_pos, old_pos, data_len) -> (old_pos, data_len, %%after, new_pos) - PUSH @SEGMENT_RLP_RAW - GET_CONTEXT - PUSH 0 + %stack (new_rlp_addr, old_rlp_addr, data_len) -> (old_rlp_addr, data_len, %%after, new_rlp_addr) + // old_rlp_addr has context 0. We will call GET_CONTEXT and update it. + GET_CONTEXT ADD PUSH @SEGMENT_TXN_DATA - GET_CONTEXT - // stack: DST, SRC, data_len, %%after, new_pos - %jump(memcpy) + GET_CONTEXT ADD + // stack: DST, SRC, data_len, %%after, new_rlp_addr + %jump(memcpy_bytes) %%after: - // stack: new_pos + // stack: new_rlp_addr %endmacro %macro decode_and_store_access_list - // stack: pos + // stack: rlp_addr DUP1 %mstore_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_START) %decode_rlp_list_len - %stack (pos, len) -> (len, len, pos, %%after) + %stack (rlp_addr, len) -> (len, len, rlp_addr, %%after) %jumpi(decode_and_store_access_list) - // stack: len, pos, %%after + // stack: len, rlp_addr, %%after POP SWAP1 POP - // stack: pos + // stack: rlp_addr %mload_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_START) DUP2 SUB %mstore_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_LEN) %%after: %endmacro %macro decode_and_store_y_parity - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, y_parity) -> (y_parity, pos) + %stack (rlp_addr, y_parity) -> (y_parity, rlp_addr) %mstore_txn_field(@TXN_FIELD_Y_PARITY) - // stack: pos + // stack: rlp_addr %endmacro %macro decode_and_store_r - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, r) -> (r, pos) + %stack (rlp_addr, r) -> (r, rlp_addr) %mstore_txn_field(@TXN_FIELD_R) - // stack: pos + // stack: rlp_addr %endmacro %macro decode_and_store_s - // stack: pos + // stack: rlp_addr %decode_rlp_scalar - %stack (pos, s) -> (s, pos) + %stack (rlp_addr, s) -> (s, rlp_addr) %mstore_txn_field(@TXN_FIELD_S) - // stack: pos + // stack: rlp_addr %endmacro // The access list is of the form `[[{20 bytes}, [{32 bytes}...]]...]`. 
global decode_and_store_access_list: - // stack: len, pos + // stack: len, rlp_addr DUP2 ADD - // stack: end_pos, pos + // stack: end_rlp_addr, rlp_addr // Store the RLP length. %mload_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_START) DUP2 SUB %mstore_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_LEN) SWAP1 decode_and_store_access_list_loop: - // stack: pos, end_pos + // stack: rlp_addr, end_rlp_addr DUP2 DUP2 EQ %jumpi(decode_and_store_access_list_finish) - // stack: pos, end_pos + // stack: rlp_addr, end_rlp_addr %decode_rlp_list_len // Should be a list `[{20 bytes}, [{32 bytes}...]]` - // stack: pos, internal_len, end_pos + // stack: rlp_addr, internal_len, end_rlp_addr SWAP1 POP // We don't need the length of this list. - // stack: pos, end_pos + // stack: rlp_addr, end_rlp_addr %decode_rlp_scalar // Address // TODO: Should panic when address is not 20 bytes? - // stack: pos, addr, end_pos + // stack: rlp_addr, addr, end_rlp_addr SWAP1 - // stack: addr, pos, end_pos + // stack: addr, rlp_addr, end_rlp_addr DUP1 %insert_accessed_addresses_no_return - // stack: addr, pos, end_pos + // stack: addr, rlp_addr, end_rlp_addr %add_address_cost - // stack: addr, pos, end_pos + // stack: addr, rlp_addr, end_rlp_addr SWAP1 - // stack: pos, addr, end_pos + // stack: rlp_addr, addr, end_rlp_addr %decode_rlp_list_len // Should be a list of storage keys `[{32 bytes}...]` - // stack: pos, sk_len, addr, end_pos + // stack: rlp_addr, sk_len, addr, end_rlp_addr SWAP1 DUP2 ADD - // stack: sk_end_pos, pos, addr, end_pos + // stack: sk_end_rlp_addr, rlp_addr, addr, end_rlp_addr SWAP1 - // stack: pos, sk_end_pos, addr, end_pos + // stack: rlp_addr, sk_end_rlp_addr, addr, end_rlp_addr sk_loop: DUP2 DUP2 EQ %jumpi(end_sk) - // stack: pos, sk_end_pos, addr, end_pos + // stack: rlp_addr, sk_end_rlp_addr, addr, end_rlp_addr %decode_rlp_scalar // Storage key // TODO: Should panic when key is not 32 bytes? 
- %stack (pos, key, sk_end_pos, addr, end_pos) -> - (addr, key, sk_loop_contd, pos, sk_end_pos, addr, end_pos) + %stack (rlp_addr, key, sk_end_rlp_addr, addr, end_rlp_addr) -> + (addr, key, sk_loop_contd, rlp_addr, sk_end_rlp_addr, addr, end_rlp_addr) %jump(insert_accessed_storage_keys_with_original_value) sk_loop_contd: - // stack: pos, sk_end_pos, addr, end_pos + // stack: rlp_addr, sk_end_rlp_addr, addr, end_rlp_addr %add_storage_key_cost %jump(sk_loop) end_sk: - %stack (pos, sk_end_pos, addr, end_pos) -> (pos, end_pos) + %stack (rlp_addr, sk_end_rlp_addr, addr, end_rlp_addr) -> (rlp_addr, end_rlp_addr) %jump(decode_and_store_access_list_loop) decode_and_store_access_list_finish: - %stack (pos, end_pos, retdest) -> (retdest, pos) + %stack (rlp_addr, end_rlp_addr, retdest) -> (retdest, rlp_addr) JUMP %macro add_address_cost diff --git a/evm/src/cpu/kernel/asm/transactions/router.asm b/evm/src/cpu/kernel/asm/transactions/router.asm index 109334dd0d..edabfbc43a 100644 --- a/evm/src/cpu/kernel/asm/transactions/router.asm +++ b/evm/src/cpu/kernel/asm/transactions/router.asm @@ -5,10 +5,8 @@ global route_txn: // stack: txn_counter, num_nibbles, retdest // First load transaction data into memory, where it will be parsed. - PUSH read_txn_from_memory - SWAP2 SWAP1 - PUSH update_txn_trie - // stack: update_txn_trie, tx_counter, num_nibbles, read_txn_from_memory, retdest + %stack(txn_counter, num_nibbles) -> (update_txn_trie, txn_counter, num_nibbles, read_txn_from_memory) + // stack: update_txn_trie, txn_counter, num_nibbles, read_txn_from_memory, retdest %jump(read_rlp_to_memory) // At this point, the raw txn data is in memory. @@ -20,15 +18,15 @@ read_txn_from_memory: // Type 0 (legacy) transactions have no such prefix, but their RLP will have a // first byte >= 0xc0, so there is no overlap. 
- PUSH 0 - %mload_kernel(@SEGMENT_RLP_RAW) + PUSH @SEGMENT_RLP_RAW // ctx == virt == 0 + MLOAD_GENERAL %eq_const(1) // stack: first_byte == 1, retdest %jumpi(process_type_1_txn) // stack: retdest - PUSH 0 - %mload_kernel(@SEGMENT_RLP_RAW) + PUSH @SEGMENT_RLP_RAW // ctx == virt == 0 + MLOAD_GENERAL %eq_const(2) // stack: first_byte == 2, retdest %jumpi(process_type_2_txn) @@ -53,10 +51,12 @@ global update_txn_trie: // and now copy txn_rlp to the new block %stack (rlp_start, txn_rlp_len, value_ptr, txn_counter, num_nibbles) -> ( - 0, @SEGMENT_TRIE_DATA, rlp_start, // dest addr - 0, @SEGMENT_RLP_RAW, 0, // src addr. Kernel has context 0 + @SEGMENT_RLP_RAW, // src addr. ctx == virt == 0 + rlp_start, @SEGMENT_TRIE_DATA, // swapped dest addr, ctx == 0 txn_rlp_len, // mcpy len txn_rlp_len, rlp_start, txn_counter, num_nibbles, value_ptr) + SWAP2 %build_kernel_address + // stack: DST, SRC, txn_rlp_len, txn_rlp_len, rlp_start, txn_counter, num_nibbles, value_ptr %memcpy_bytes ADD %set_trie_data_size diff --git a/evm/src/cpu/kernel/asm/transactions/type_0.asm b/evm/src/cpu/kernel/asm/transactions/type_0.asm index edd01e512d..a3f3bb0d25 100644 --- a/evm/src/cpu/kernel/asm/transactions/type_0.asm +++ b/evm/src/cpu/kernel/asm/transactions/type_0.asm @@ -13,68 +13,68 @@ global process_type_0_txn: // stack: retdest - PUSH 0 // initial pos - // stack: pos, retdest + PUSH @SEGMENT_RLP_RAW // ctx == virt == 0 + // stack: rlp_addr, retdest %decode_rlp_list_len // We don't actually need the length. - %stack (pos, len) -> (pos) + %stack (rlp_addr, len) -> (rlp_addr) - // stack: pos, retdest + // stack: rlp_addr, retdest %decode_and_store_nonce %decode_and_store_gas_price_legacy %decode_and_store_gas_limit %decode_and_store_to %decode_and_store_value %decode_and_store_data - // stack: pos, retdest + // stack: rlp_addr, retdest // Parse the "v" field. 
- // stack: pos, retdest + // stack: rlp_addr, retdest %decode_rlp_scalar - // stack: pos, v, retdest + // stack: rlp_addr, v, retdest SWAP1 - // stack: v, pos, retdest + // stack: v, rlp_addr, retdest DUP1 %gt_const(28) - // stack: v > 28, v, pos, retdest + // stack: v > 28, v, rlp_addr, retdest %jumpi(process_v_new_style) // We have an old style v, so y_parity = v - 27. // No chain ID is present, so we can leave TXN_FIELD_CHAIN_ID_PRESENT and // TXN_FIELD_CHAIN_ID with their default values of zero. - // stack: v, pos, retdest + // stack: v, rlp_addr, retdest %sub_const(27) - %stack (y_parity, pos) -> (y_parity, pos) + %stack (y_parity, rlp_addr) -> (y_parity, rlp_addr) %mstore_txn_field(@TXN_FIELD_Y_PARITY) - // stack: pos, retdest + // stack: rlp_addr, retdest %jump(decode_r_and_s) process_v_new_style: - // stack: v, pos, retdest + // stack: v, rlp_addr, retdest // We have a new style v, so chain_id_present = 1, // chain_id = (v - 35) / 2, and y_parity = (v - 35) % 2. - %stack (v, pos) -> (1, v, pos) + %stack (v, rlp_addr) -> (1, v, rlp_addr) %mstore_txn_field(@TXN_FIELD_CHAIN_ID_PRESENT) - // stack: v, pos, retdest + // stack: v, rlp_addr, retdest %sub_const(35) DUP1 - // stack: v - 35, v - 35, pos, retdest - %div_const(2) - // stack: chain_id, v - 35, pos, retdest + // stack: v - 35, v - 35, rlp_addr, retdest + %div2 + // stack: chain_id, v - 35, rlp_addr, retdest %mstore_txn_field(@TXN_FIELD_CHAIN_ID) - // stack: v - 35, pos, retdest + // stack: v - 35, rlp_addr, retdest %mod_const(2) - // stack: y_parity, pos, retdest + // stack: y_parity, rlp_addr, retdest %mstore_txn_field(@TXN_FIELD_Y_PARITY) decode_r_and_s: - // stack: pos, retdest + // stack: rlp_addr, retdest %decode_and_store_r %decode_and_store_s - // stack: pos, retdest + // stack: rlp_addr, retdest POP // stack: retdest @@ -85,73 +85,68 @@ type_0_compute_signed_data: // keccak256(rlp([nonce, gas_price, gas_limit, to, value, data])) %alloc_rlp_block - // stack: rlp_start, retdest + // stack: 
rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_NONCE) - // stack: nonce, rlp_start, retdest + // stack: nonce, rlp_addr_start, retdest DUP2 - // stack: rlp_pos, nonce, rlp_start, retdest + // stack: rlp_addr, nonce, rlp_addr_start, retdest %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_MAX_FEE_PER_GAS) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_GAS_LIMIT) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_TO) %mload_global_metadata(@GLOBAL_METADATA_CONTRACT_CREATION) %jumpi(zero_to) - // stack: to, rlp_pos, rlp_start, retdest + // stack: to, rlp_addr, rlp_addr_start, retdest SWAP1 %encode_rlp_160 %jump(after_to) zero_to: - // stack: to, rlp_pos, rlp_start, retdest - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + // stack: to, rlp_addr, rlp_addr_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest after_to: %mload_txn_field(@TXN_FIELD_VALUE) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest // Encode txn data. 
%mload_txn_field(@TXN_FIELD_DATA_LEN) - PUSH 0 // ADDR.virt PUSH @SEGMENT_TXN_DATA - PUSH 0 // ADDR.context - // stack: ADDR: 3, len, rlp_pos, rlp_start, retdest + // stack: ADDR, len, rlp_addr, rlp_addr_start, retdest PUSH after_serializing_txn_data - // stack: after_serializing_txn_data, ADDR: 3, len, rlp_pos, rlp_start, retdest - SWAP5 - // stack: rlp_pos, ADDR: 3, len, after_serializing_txn_data, rlp_start, retdest + // stack: after_serializing_txn_data, ADDR, len, rlp_addr, rlp_addr_start, retdest + SWAP3 + // stack: rlp_addr, ADDR, len, after_serializing_txn_data, rlp_addr_start, retdest %jump(encode_rlp_string) after_serializing_txn_data: - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_CHAIN_ID_PRESENT) ISZERO %jumpi(finish_rlp_list) - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_CHAIN_ID) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest PUSH 0 - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest PUSH 0 - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest finish_rlp_list: %prepend_rlp_list_prefix - // stack: prefix_start_pos, rlp_len, retdest - PUSH @SEGMENT_RLP_RAW - PUSH 0 // context - // stack: ADDR: 3, rlp_len, retdest + // stack: ADDR, rlp_len, retdest KECCAK_GENERAL // stack: hash, retdest diff --git a/evm/src/cpu/kernel/asm/transactions/type_1.asm b/evm/src/cpu/kernel/asm/transactions/type_1.asm index f8396e50a4..e64a4aee03 100644 --- a/evm/src/cpu/kernel/asm/transactions/type_1.asm +++ b/evm/src/cpu/kernel/asm/transactions/type_1.asm @@ -8,11 +8,14 @@ global process_type_1_txn: // stack: retdest - PUSH 1 // initial pos, skipping over 
the 0x01 byte - // stack: pos, retdest + // Initial rlp address offset of 1 (skipping over the 0x01 byte) + PUSH 1 + PUSH @SEGMENT_RLP_RAW + %build_kernel_address + // stack: rlp_addr, retdest %decode_rlp_list_len // We don't actually need the length. - %stack (pos, len) -> (pos) + %stack (rlp_addr, len) -> (rlp_addr) %store_chain_id_present_true %decode_and_store_chain_id @@ -27,7 +30,7 @@ global process_type_1_txn: %decode_and_store_r %decode_and_store_s - // stack: pos, retdest + // stack: rlp_addr, retdest POP // stack: retdest @@ -36,83 +39,79 @@ global process_type_1_txn: // over keccak256(0x01 || rlp([chainId, nonce, gasPrice, gasLimit, to, value, data, accessList])). type_1_compute_signed_data: %alloc_rlp_block - // stack: rlp_start, retdest + // stack: rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_CHAIN_ID) - // stack: chain_id, rlp_start, retdest + // stack: chain_id, rlp_addr_start, retdest DUP2 - // stack: rlp_pos, chain_id, rlp_start, retdest + // stack: rlp_addr, chain_id, rlp_addr_start, retdest %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_NONCE) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_MAX_FEE_PER_GAS) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_GAS_LIMIT) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_TO) %mload_global_metadata(@GLOBAL_METADATA_CONTRACT_CREATION) %jumpi(zero_to) - // stack: to, rlp_pos, rlp_start, retdest + // stack: to, rlp_addr, rlp_addr_start, retdest SWAP1 %encode_rlp_160 %jump(after_to) zero_to: - // stack: to, rlp_pos, rlp_start, retdest - 
SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + // stack: to, rlp_addr, rlp_addr_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest after_to: %mload_txn_field(@TXN_FIELD_VALUE) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_addr_start, retdest // Encode txn data. %mload_txn_field(@TXN_FIELD_DATA_LEN) - PUSH 0 // ADDR.virt - PUSH @SEGMENT_TXN_DATA - PUSH 0 // ADDR.context - // stack: ADDR: 3, len, rlp_pos, rlp_start, retdest + PUSH @SEGMENT_TXN_DATA // ctx == virt == 0 + // stack: ADDR, len, rlp_addr, rlp_addr_start, retdest PUSH after_serializing_txn_data - // stack: after_serializing_txn_data, ADDR: 3, len, rlp_pos, rlp_start, retdest - SWAP5 - // stack: rlp_pos, ADDR: 3, len, after_serializing_txn_data, rlp_start, retdest + // stack: after_serializing_txn_data, ADDR, len, rlp_addr, rlp_addr_start, retdest + SWAP3 + // stack: rlp_addr, ADDR, len, after_serializing_txn_data, rlp_addr_start, retdest %jump(encode_rlp_string) after_serializing_txn_data: // Instead of manually encoding the access list, we just copy the raw RLP from the transaction. 
%mload_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_START) %mload_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_LEN) - %stack (al_len, al_start, rlp_pos, rlp_start, retdest) -> + %stack (al_len, al_start, rlp_addr, rlp_addr_start, retdest) -> ( - 0, @SEGMENT_RLP_RAW, rlp_pos, - 0, @SEGMENT_RLP_RAW, al_start, + rlp_addr, + al_start, al_len, after_serializing_access_list, - rlp_pos, rlp_start, retdest) - %jump(memcpy) + rlp_addr, rlp_addr_start, retdest) + %jump(memcpy_bytes) after_serializing_access_list: - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_addr_start, retdest %mload_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_LEN) ADD - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_addr_start, retdest %prepend_rlp_list_prefix - // stack: prefix_start_pos, rlp_len, retdest + // stack: prefix_start_rlp_addr, rlp_len, retdest // Store a `1` in front of the RLP %decrement - %stack (pos) -> (0, @SEGMENT_RLP_RAW, pos, 1, pos) + %stack (rlp_addr) -> (1, rlp_addr, rlp_addr) MSTORE_GENERAL - // stack: pos, rlp_len, retdest + // stack: rlp_addr, rlp_len, retdest // Hash the RLP + the leading `1` SWAP1 %increment SWAP1 - PUSH @SEGMENT_RLP_RAW - PUSH 0 // context - // stack: ADDR: 3, len, retdest + // stack: ADDR, len, retdest KECCAK_GENERAL // stack: hash, retdest diff --git a/evm/src/cpu/kernel/asm/transactions/type_2.asm b/evm/src/cpu/kernel/asm/transactions/type_2.asm index 38f1980fae..5074c57950 100644 --- a/evm/src/cpu/kernel/asm/transactions/type_2.asm +++ b/evm/src/cpu/kernel/asm/transactions/type_2.asm @@ -9,13 +9,16 @@ global process_type_2_txn: // stack: retdest - PUSH 1 // initial pos, skipping over the 0x02 byte - // stack: pos, retdest + // Initial rlp address offset of 1 (skipping over the 0x02 byte) + PUSH 1 + PUSH @SEGMENT_RLP_RAW + %build_kernel_address + // stack: rlp_addr, retdest %decode_rlp_list_len // We don't actually need the length. 
- %stack (pos, len) -> (pos) + %stack (rlp_addr, len) -> (rlp_addr) - // stack: pos, retdest + // stack: rlp_addr, retdest %store_chain_id_present_true %decode_and_store_chain_id %decode_and_store_nonce @@ -30,7 +33,7 @@ global process_type_2_txn: %decode_and_store_r %decode_and_store_s - // stack: pos, retdest + // stack: rlp_addr, retdest POP // stack: retdest @@ -39,87 +42,83 @@ global process_type_2_txn: // keccak256(0x02 || rlp([chain_id, nonce, max_priority_fee_per_gas, max_fee_per_gas, gas_limit, destination, amount, data, access_list])) type_2_compute_signed_data: %alloc_rlp_block - // stack: rlp_start, retdest + // stack: rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_CHAIN_ID) // stack: chain_id, rlp_start, retdest DUP2 - // stack: rlp_pos, chain_id, rlp_start, retdest + // stack: rlp_addr, chain_id, rlp_start, retdest %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_start, retdest %mload_txn_field(@TXN_FIELD_NONCE) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_start, retdest %mload_txn_field(@TXN_FIELD_MAX_PRIORITY_FEE_PER_GAS) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_start, retdest %mload_txn_field(@TXN_FIELD_MAX_FEE_PER_GAS) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_start, retdest %mload_txn_field(@TXN_FIELD_GAS_LIMIT) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_start, retdest %mload_txn_field(@TXN_FIELD_TO) %mload_global_metadata(@GLOBAL_METADATA_CONTRACT_CREATION) %jumpi(zero_to) - // stack: to, rlp_pos, rlp_start, retdest + // stack: to, rlp_addr, rlp_start, retdest SWAP1 %encode_rlp_160 %jump(after_to) zero_to: - // stack: to, rlp_pos, rlp_start, retdest - SWAP1 
%encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + // stack: to, rlp_addr, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_start, retdest after_to: %mload_txn_field(@TXN_FIELD_VALUE) - SWAP1 %encode_rlp_scalar - // stack: rlp_pos, rlp_start, retdest + %encode_rlp_scalar_swapped_inputs + // stack: rlp_addr, rlp_start, retdest // Encode txn data. %mload_txn_field(@TXN_FIELD_DATA_LEN) - PUSH 0 // ADDR.virt - PUSH @SEGMENT_TXN_DATA - PUSH 0 // ADDR.context - // stack: ADDR: 3, len, rlp_pos, rlp_start, retdest + PUSH @SEGMENT_TXN_DATA // ctx == virt == 0 + // stack: ADDR, len, rlp_addr, rlp_start, retdest PUSH after_serializing_txn_data - // stack: after_serializing_txn_data, ADDR: 3, len, rlp_pos, rlp_start, retdest - SWAP5 - // stack: rlp_pos, ADDR: 3, len, after_serializing_txn_data, rlp_start, retdest + // stack: after_serializing_txn_data, ADDR, len, rlp_addr, rlp_start, retdest + SWAP3 + // stack: rlp_addr, ADDR, len, after_serializing_txn_data, rlp_start, retdest %jump(encode_rlp_string) after_serializing_txn_data: // Instead of manually encoding the access list, we just copy the raw RLP from the transaction. 
%mload_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_START) %mload_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_LEN) - %stack (al_len, al_start, rlp_pos, rlp_start, retdest) -> + %stack (al_len, al_start, rlp_addr, rlp_start, retdest) -> ( - 0, @SEGMENT_RLP_RAW, rlp_pos, - 0, @SEGMENT_RLP_RAW, al_start, + rlp_addr, + al_start, al_len, after_serializing_access_list, - rlp_pos, rlp_start, retdest) - %jump(memcpy) + rlp_addr, rlp_start, retdest) + %jump(memcpy_bytes) after_serializing_access_list: - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_start, retdest %mload_global_metadata(@GLOBAL_METADATA_ACCESS_LIST_RLP_LEN) ADD - // stack: rlp_pos, rlp_start, retdest + // stack: rlp_addr, rlp_start, retdest %prepend_rlp_list_prefix // stack: prefix_start_pos, rlp_len, retdest // Store a `2` in front of the RLP %decrement - %stack (pos) -> (0, @SEGMENT_RLP_RAW, pos, 2, pos) + %stack (rlp_addr) -> (2, rlp_addr, rlp_addr) MSTORE_GENERAL - // stack: pos, rlp_len, retdest + // stack: rlp_addr, rlp_len, retdest // Hash the RLP + the leading `2` SWAP1 %increment SWAP1 - PUSH @SEGMENT_RLP_RAW - PUSH 0 // context - // stack: ADDR: 3, len, retdest + // stack: ADDR, len, retdest KECCAK_GENERAL // stack: hash, retdest diff --git a/evm/src/cpu/kernel/asm/util/assertions.asm b/evm/src/cpu/kernel/asm/util/assertions.asm index 017ca10f2d..6c517407b1 100644 --- a/evm/src/cpu/kernel/asm/util/assertions.asm +++ b/evm/src/cpu/kernel/asm/util/assertions.asm @@ -24,13 +24,13 @@ global panic: %endmacro %macro assert_eq - EQ - %assert_nonzero + SUB + %jumpi(panic) %endmacro %macro assert_eq(ret) - EQ - %assert_nonzero($ret) + SUB + %jumpi($ret) %endmacro %macro assert_lt @@ -82,8 +82,9 @@ global panic: %endmacro %macro assert_eq_const(c) - %eq_const($c) - %assert_nonzero + PUSH $c + SUB + %jumpi(panic) %endmacro %macro assert_lt_const(c) diff --git a/evm/src/cpu/kernel/asm/util/basic_macros.asm b/evm/src/cpu/kernel/asm/util/basic_macros.asm index fc2472b3b8..78fd34fc1c 
100644 --- a/evm/src/cpu/kernel/asm/util/basic_macros.asm +++ b/evm/src/cpu/kernel/asm/util/basic_macros.asm @@ -8,6 +8,19 @@ jumpi %endmacro +// Jump to `jumpdest` if the top of the stack is != c +%macro jump_neq_const(c, jumpdest) + PUSH $c + SUB + %jumpi($jumpdest) +%endmacro + +// Jump to `jumpdest` if the top of the stack is < c +%macro jumpi_lt_const(c, jumpdest) + %ge_const($c) + %jumpi($jumpdest) +%endmacro + %macro pop2 %rep 2 POP @@ -271,9 +284,9 @@ %macro ceil_div // stack: x, y - DUP2 - // stack: y, x, y - %decrement + PUSH 1 + DUP3 + SUB // y - 1 // stack: y - 1, x, y ADD DIV @@ -333,7 +346,10 @@ %endmacro %macro div2 - %div_const(2) + // stack: x + PUSH 1 + SHR + // stack: x >> 1 %endmacro %macro iseven @@ -410,3 +426,60 @@ ISZERO // stack: not b %endmacro + +%macro build_address + // stack: ctx, seg, off + ADD + ADD + // stack: addr +%endmacro + +%macro build_address_no_offset + // stack: ctx, seg + ADD + // stack: addr +%endmacro + +%macro build_current_general_address + // stack: offset + PUSH @SEGMENT_KERNEL_GENERAL + GET_CONTEXT + %build_address + // stack: addr +%endmacro + +%macro build_current_general_address_no_offset + // stack: + PUSH @SEGMENT_KERNEL_GENERAL + GET_CONTEXT + %build_address_no_offset + // stack: addr (offset == 0) +%endmacro + +%macro build_kernel_address + // stack: seg, off + ADD + // stack: addr (ctx == 0) +%endmacro + +%macro build_address_with_ctx(seg, off) + // stack: ctx + PUSH $seg + PUSH $off + %build_address + // stack: addr +%endmacro + +%macro build_address_with_ctx_no_offset(seg) + // stack: ctx + PUSH $seg + ADD + // stack: addr +%endmacro + +%macro build_address_with_ctx_no_segment(off) + // stack: ctx + PUSH $off + ADD + // stack: addr +%endmacro diff --git a/evm/src/cpu/kernel/asm/util/keccak.asm b/evm/src/cpu/kernel/asm/util/keccak.asm index 1a1f437287..dceb7b195b 100644 --- a/evm/src/cpu/kernel/asm/util/keccak.asm +++ b/evm/src/cpu/kernel/asm/util/keccak.asm @@ -18,7 +18,8 @@ global sys_keccak256: %stack 
(kexit_info, offset, len) -> (offset, len, kexit_info) PUSH @SEGMENT_MAIN_MEMORY GET_CONTEXT - // stack: ADDR: 3, len, kexit_info + %build_address + // stack: ADDR, len, kexit_info KECCAK_GENERAL // stack: hash, kexit_info SWAP1 @@ -37,11 +38,11 @@ sys_keccak256_empty: %macro keccak256_word(num_bytes) // Since KECCAK_GENERAL takes its input from memory, we will first write // input_word's bytes to @SEGMENT_KERNEL_GENERAL[0..$num_bytes]. - %stack (word) -> (0, @SEGMENT_KERNEL_GENERAL, 0, word, $num_bytes, %%after_mstore) + %stack (word) -> (@SEGMENT_KERNEL_GENERAL, word, $num_bytes, %%after_mstore, $num_bytes, $num_bytes) %jump(mstore_unpacking) %%after_mstore: - // stack: offset - %stack (offset) -> (0, @SEGMENT_KERNEL_GENERAL, 0, $num_bytes) // context, segment, offset, len + // stack: addr, $num_bytes, $num_bytes + SUB KECCAK_GENERAL %endmacro @@ -53,12 +54,11 @@ sys_keccak256_empty: // Since KECCAK_GENERAL takes its input from memory, we will first write // a's bytes to @SEGMENT_KERNEL_GENERAL[0..32], then b's bytes to // @SEGMENT_KERNEL_GENERAL[32..64]. 
- %stack (a) -> (0, @SEGMENT_KERNEL_GENERAL, 0, a, 32, %%after_mstore_a) - %jump(mstore_unpacking) -%%after_mstore_a: - %stack (offset, b) -> (0, @SEGMENT_KERNEL_GENERAL, 32, b, 32, %%after_mstore_b) - %jump(mstore_unpacking) -%%after_mstore_b: - %stack (offset) -> (0, @SEGMENT_KERNEL_GENERAL, 0, 64) // context, segment, offset, len + %stack (a) -> (@SEGMENT_KERNEL_GENERAL, a) + MSTORE_32BYTES_32 + // stack: addr, b + MSTORE_32BYTES_32 + %stack (addr) -> (addr, 64, 64) // reset the address offset + SUB KECCAK_GENERAL %endmacro diff --git a/evm/src/cpu/kernel/asm/util/math.asm b/evm/src/cpu/kernel/asm/util/math.asm index 98f7b0086d..4bdf690238 100644 --- a/evm/src/cpu/kernel/asm/util/math.asm +++ b/evm/src/cpu/kernel/asm/util/math.asm @@ -5,7 +5,7 @@ log2_floor_helper: ISZERO %jumpi(end) // stack: val, counter, retdest - %div_const(2) + %div2 // stack: val/2, counter, retdest SWAP1 %increment @@ -22,7 +22,7 @@ end: global log2_floor: // stack: val, retdest - %div_const(2) + %div2 // stack: val/2, retdest PUSH 0 // stack: 0, val/2, retdest diff --git a/evm/src/cpu/kernel/assembler.rs b/evm/src/cpu/kernel/assembler.rs index d708d22829..2dc79d6111 100644 --- a/evm/src/cpu/kernel/assembler.rs +++ b/evm/src/cpu/kernel/assembler.rs @@ -2,13 +2,13 @@ use std::collections::HashMap; use std::fs; use std::time::Instant; -use ethereum_types::U256; +use ethereum_types::{H256, U256}; use itertools::{izip, Itertools}; use keccak_hash::keccak; use log::debug; use serde::{Deserialize, Serialize}; -use super::ast::PushTarget; +use super::ast::{BytesTarget, PushTarget}; use crate::cpu::kernel::ast::Item::LocalLabelDeclaration; use crate::cpu::kernel::ast::{File, Item, StackReplacement}; use crate::cpu::kernel::opcodes::{get_opcode, get_push_opcode}; @@ -26,9 +26,8 @@ pub(crate) const BYTES_PER_OFFSET: u8 = 3; pub struct Kernel { pub(crate) code: Vec, - /// Computed using `hash_kernel`. 
It is encoded as `u32` limbs for convenience, since we deal - /// with `u32` limbs in our Keccak table. - pub(crate) code_hash: [u32; 8], + /// Computed using `hash_kernel`. + pub(crate) code_hash: H256, pub(crate) global_labels: HashMap, pub(crate) ordered_labels: Vec, @@ -43,11 +42,7 @@ impl Kernel { global_labels: HashMap, prover_inputs: HashMap, ) -> Self { - let code_hash_bytes = keccak(&code).0; - let code_hash_be = core::array::from_fn(|i| { - u32::from_le_bytes(core::array::from_fn(|j| code_hash_bytes[i * 4 + j])) - }); - let code_hash = code_hash_be.map(u32::from_be); + let code_hash = keccak(&code); let ordered_labels = global_labels .keys() .cloned() @@ -277,6 +272,23 @@ fn inline_constants(body: Vec, constants: &HashMap) -> Vec { code.push(get_opcode(&opcode)); } - Item::Bytes(bytes) => code.extend(bytes), + Item::Bytes(targets) => { + for target in targets { + match target { + BytesTarget::Literal(n) => code.push(n), + BytesTarget::Constant(c) => panic!("Constant wasn't inlined: {c}"), + } + } + } Item::Jumptable(labels) => { for label in labels { let bytes = look_up_label(&label, &local_labels, global_labels); @@ -411,12 +430,7 @@ fn push_target_size(target: &PushTarget) -> u8 { #[cfg(test)] mod tests { - use std::collections::HashMap; - - use itertools::Itertools; - - use crate::cpu::kernel::assembler::*; - use crate::cpu::kernel::ast::*; + use super::*; use crate::cpu::kernel::parser::parse; #[test] @@ -507,7 +521,10 @@ mod tests { #[test] fn literal_bytes() { let file = File { - body: vec![Item::Bytes(vec![0x12, 42]), Item::Bytes(vec![0xFE, 255])], + body: vec![ + Item::Bytes(vec![BytesTarget::Literal(0x12), BytesTarget::Literal(42)]), + Item::Bytes(vec![BytesTarget::Literal(0xFE), BytesTarget::Literal(255)]), + ], }; let code = assemble(vec![file], HashMap::new(), false).code; assert_eq!(code, vec![0x12, 42, 0xfe, 255]); diff --git a/evm/src/cpu/kernel/ast.rs b/evm/src/cpu/kernel/ast.rs index ed4f6dbb1b..0af3bdabeb 100644 --- 
a/evm/src/cpu/kernel/ast.rs +++ b/evm/src/cpu/kernel/ast.rs @@ -33,7 +33,7 @@ pub(crate) enum Item { /// Any opcode besides a PUSH opcode. StandardOp(String), /// Literal hex data; should contain an even number of hex chars. - Bytes(Vec), + Bytes(Vec), /// Creates a table of addresses from a list of labels. Jumptable(Vec), } @@ -75,3 +75,10 @@ pub(crate) enum PushTarget { MacroVar(String), Constant(String), } + +/// The target of a `BYTES` item. +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub(crate) enum BytesTarget { + Literal(u8), + Constant(String), +} diff --git a/evm/src/cpu/kernel/constants/context_metadata.rs b/evm/src/cpu/kernel/constants/context_metadata.rs index fd710eec40..ffcc65387a 100644 --- a/evm/src/cpu/kernel/constants/context_metadata.rs +++ b/evm/src/cpu/kernel/constants/context_metadata.rs @@ -1,40 +1,52 @@ +use crate::memory::segments::Segment; + /// These metadata fields contain VM state specific to a particular context. +/// +/// Each value is directly scaled by the corresponding `Segment::ContextMetadata` value for faster +/// memory access in the kernel. +#[allow(clippy::enum_clike_unportable_variant)] +#[repr(usize)] #[derive(Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd, Debug)] pub(crate) enum ContextMetadata { /// The ID of the context which created this one. - ParentContext = 0, + ParentContext = Segment::ContextMetadata as usize, /// The program counter to return to when we return to the parent context. - ParentProgramCounter = 1, - CalldataSize = 2, - ReturndataSize = 3, + ParentProgramCounter, + CalldataSize, + ReturndataSize, /// The address of the account associated with this context. - Address = 4, + Address, /// The size of the code under the account associated with this context. /// While this information could be obtained from the state trie, it is best to cache it since /// the `CODESIZE` instruction is very cheap. - CodeSize = 5, + CodeSize, /// The address of the caller who spawned this context. 
- Caller = 6, + Caller, /// The value (in wei) deposited by the caller. - CallValue = 7, + CallValue, /// Whether this context was created by `STATICCALL`, in which case state changes are /// prohibited. - Static = 8, + Static, /// Pointer to the initial version of the state trie, at the creation of this context. Used when /// we need to revert a context. - StateTrieCheckpointPointer = 9, + StateTrieCheckpointPointer, /// Size of the active main memory, in (32 byte) words. - MemWords = 10, - StackSize = 11, + MemWords, + StackSize, /// The gas limit for this call (not the entire transaction). - GasLimit = 12, - ContextCheckpointsLen = 13, + GasLimit, + ContextCheckpointsLen, } impl ContextMetadata { pub(crate) const COUNT: usize = 14; - pub(crate) fn all() -> [Self; Self::COUNT] { + /// Unscales this virtual offset by their respective `Segment` value. + pub(crate) const fn unscale(&self) -> usize { + *self as usize - Segment::ContextMetadata as usize + } + + pub(crate) const fn all() -> [Self; Self::COUNT] { [ Self::ParentContext, Self::ParentProgramCounter, @@ -54,7 +66,7 @@ impl ContextMetadata { } /// The variable name that gets passed into kernel assembly code. 
- pub(crate) fn var_name(&self) -> &'static str { + pub(crate) const fn var_name(&self) -> &'static str { match self { ContextMetadata::ParentContext => "CTX_METADATA_PARENT_CONTEXT", ContextMetadata::ParentProgramCounter => "CTX_METADATA_PARENT_PC", diff --git a/evm/src/cpu/kernel/constants/exc_bitfields.rs b/evm/src/cpu/kernel/constants/exc_bitfields.rs index ff0782b322..1603ef52bb 100644 --- a/evm/src/cpu/kernel/constants/exc_bitfields.rs +++ b/evm/src/cpu/kernel/constants/exc_bitfields.rs @@ -1,4 +1,4 @@ -use std::ops::RangeInclusive; +use core::ops::RangeInclusive; use ethereum_types::U256; @@ -28,7 +28,7 @@ const fn u256_from_set_index_ranges(ranges: &[RangeInclusive U256(res_limbs) } -pub const STACK_LENGTH_INCREASING_OPCODES_USER: U256 = u256_from_set_index_ranges(&[ +pub(crate) const STACK_LENGTH_INCREASING_OPCODES_USER: U256 = u256_from_set_index_ranges(&[ 0x30..=0x30, // ADDRESS 0x32..=0x34, // ORIGIN, CALLER, CALLVALUE 0x36..=0x36, // CALLDATASIZE @@ -41,7 +41,7 @@ pub const STACK_LENGTH_INCREASING_OPCODES_USER: U256 = u256_from_set_index_range 0x5f..=0x8f, // PUSH*, DUP* ]); -pub const INVALID_OPCODES_USER: U256 = u256_from_set_index_ranges(&[ +pub(crate) const INVALID_OPCODES_USER: U256 = u256_from_set_index_ranges(&[ 0x0c..=0x0f, 0x1e..=0x1f, 0x21..=0x2f, diff --git a/evm/src/cpu/kernel/constants/global_metadata.rs b/evm/src/cpu/kernel/constants/global_metadata.rs index 77e64fe25d..9e85d467f8 100644 --- a/evm/src/cpu/kernel/constants/global_metadata.rs +++ b/evm/src/cpu/kernel/constants/global_metadata.rs @@ -1,99 +1,111 @@ +use crate::memory::segments::Segment; + /// These metadata fields contain global VM state, stored in the `Segment::Metadata` segment of the /// kernel's context (which is zero). +/// +/// Each value is directly scaled by the corresponding `Segment::GlobalMetadata` value for faster +/// memory access in the kernel. 
+#[allow(clippy::enum_clike_unportable_variant)] +#[repr(usize)] #[derive(Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd, Debug)] pub(crate) enum GlobalMetadata { /// The largest context ID that has been used so far in this execution. Tracking this allows us /// give each new context a unique ID, so that its memory will be zero-initialized. - LargestContext = 0, + LargestContext = Segment::GlobalMetadata as usize, /// The size of active memory, in bytes. - MemorySize = 1, - /// The size of the `TrieData` segment, in bytes. In other words, the next address available for - /// appending additional trie data. - TrieDataSize = 2, + MemorySize, /// The size of the `TrieData` segment, in bytes. In other words, the next address available for /// appending additional trie data. - RlpDataSize = 3, + TrieDataSize, + /// The size of the `TrieData` segment, in bytes, represented as a whole address. + /// In other words, the next address available for appending additional trie data. + RlpDataSize, /// A pointer to the root of the state trie within the `TrieData` buffer. - StateTrieRoot = 4, + StateTrieRoot, /// A pointer to the root of the transaction trie within the `TrieData` buffer. - TransactionTrieRoot = 5, + TransactionTrieRoot, /// A pointer to the root of the receipt trie within the `TrieData` buffer. - ReceiptTrieRoot = 6, + ReceiptTrieRoot, // The root digests of each Merkle trie before these transactions. - StateTrieRootDigestBefore = 7, - TransactionTrieRootDigestBefore = 8, - ReceiptTrieRootDigestBefore = 9, + StateTrieRootDigestBefore, + TransactionTrieRootDigestBefore, + ReceiptTrieRootDigestBefore, // The root digests of each Merkle trie after these transactions. - StateTrieRootDigestAfter = 10, - TransactionTrieRootDigestAfter = 11, - ReceiptTrieRootDigestAfter = 12, - - /// The sizes of the `TrieEncodedChild` and `TrieEncodedChildLen` buffers. In other words, the - /// next available offset in these buffers. 
- TrieEncodedChildSize = 13, + StateTrieRootDigestAfter, + TransactionTrieRootDigestAfter, + ReceiptTrieRootDigestAfter, // Block metadata. - BlockBeneficiary = 14, - BlockTimestamp = 15, - BlockNumber = 16, - BlockDifficulty = 17, - BlockRandom = 18, - BlockGasLimit = 19, - BlockChainId = 20, - BlockBaseFee = 21, - BlockGasUsed = 22, + BlockBeneficiary, + BlockTimestamp, + BlockNumber, + BlockDifficulty, + BlockRandom, + BlockGasLimit, + BlockChainId, + BlockBaseFee, + BlockGasUsed, /// Before current transactions block values. - BlockGasUsedBefore = 23, + BlockGasUsedBefore, /// After current transactions block values. - BlockGasUsedAfter = 24, + BlockGasUsedAfter, /// Current block header hash - BlockCurrentHash = 25, + BlockCurrentHash, /// Gas to refund at the end of the transaction. - RefundCounter = 26, + RefundCounter, /// Length of the addresses access list. - AccessedAddressesLen = 27, + AccessedAddressesLen, /// Length of the storage keys access list. - AccessedStorageKeysLen = 28, + AccessedStorageKeysLen, /// Length of the self-destruct list. - SelfDestructListLen = 29, + SelfDestructListLen, /// Length of the bloom entry buffer. - BloomEntryLen = 30, + BloomEntryLen, /// Length of the journal. - JournalLen = 31, + JournalLen, /// Length of the `JournalData` segment. - JournalDataLen = 32, + JournalDataLen, /// Current checkpoint. - CurrentCheckpoint = 33, - TouchedAddressesLen = 34, + CurrentCheckpoint, + TouchedAddressesLen, // Gas cost for the access list in type-1 txns. See EIP-2930. - AccessListDataCost = 35, + AccessListDataCost, // Start of the access list in the RLP for type-1 txns. - AccessListRlpStart = 36, + AccessListRlpStart, // Length of the access list in the RLP for type-1 txns. - AccessListRlpLen = 37, + AccessListRlpLen, // Boolean flag indicating if the txn is a contract creation txn. - ContractCreation = 38, - IsPrecompileFromEoa = 39, - CallStackDepth = 40, - /// Transaction logs list length. 
- LogsLen = 41, - LogsDataLen = 42, - LogsPayloadLen = 43, - TxnNumberBefore = 44, - TxnNumberAfter = 45, - BlockBlobBaseFee = 46, + ContractCreation, + IsPrecompileFromEoa, + CallStackDepth, + /// Transaction logs list length + LogsLen, + LogsDataLen, + LogsPayloadLen, + TxnNumberBefore, + TxnNumberAfter, + BlockBlobBaseFee, + /// Number of created contracts during the current transaction. - CreatedContractsLen = 47, + CreatedContractsLen, + + KernelHash, + KernelLen, } impl GlobalMetadata { - pub(crate) const COUNT: usize = 48; + pub(crate) const COUNT: usize = 49; + + /// Unscales this virtual offset by their respective `Segment` value. + pub(crate) const fn unscale(&self) -> usize { + *self as usize - Segment::GlobalMetadata as usize + } - pub(crate) fn all() -> [Self; Self::COUNT] { + pub(crate) const fn all() -> [Self; Self::COUNT] { [ Self::LargestContext, Self::MemorySize, @@ -108,7 +120,6 @@ impl GlobalMetadata { Self::StateTrieRootDigestAfter, Self::TransactionTrieRootDigestAfter, Self::ReceiptTrieRootDigestAfter, - Self::TrieEncodedChildSize, Self::BlockBeneficiary, Self::BlockTimestamp, Self::BlockNumber, @@ -143,11 +154,13 @@ impl GlobalMetadata { Self::TxnNumberAfter, Self::BlockBlobBaseFee, Self::CreatedContractsLen, + Self::KernelHash, + Self::KernelLen, ] } /// The variable name that gets passed into kernel assembly code. 
- pub(crate) fn var_name(&self) -> &'static str { + pub(crate) const fn var_name(&self) -> &'static str { match self { Self::LargestContext => "GLOBAL_METADATA_LARGEST_CONTEXT", Self::MemorySize => "GLOBAL_METADATA_MEMORY_SIZE", @@ -162,7 +175,6 @@ impl GlobalMetadata { Self::StateTrieRootDigestAfter => "GLOBAL_METADATA_STATE_TRIE_DIGEST_AFTER", Self::TransactionTrieRootDigestAfter => "GLOBAL_METADATA_TXN_TRIE_DIGEST_AFTER", Self::ReceiptTrieRootDigestAfter => "GLOBAL_METADATA_RECEIPT_TRIE_DIGEST_AFTER", - Self::TrieEncodedChildSize => "GLOBAL_METADATA_TRIE_ENCODED_CHILD_SIZE", Self::BlockBeneficiary => "GLOBAL_METADATA_BLOCK_BENEFICIARY", Self::BlockTimestamp => "GLOBAL_METADATA_BLOCK_TIMESTAMP", Self::BlockNumber => "GLOBAL_METADATA_BLOCK_NUMBER", @@ -197,6 +209,8 @@ impl GlobalMetadata { Self::TxnNumberAfter => "GLOBAL_METADATA_TXN_NUMBER_AFTER", Self::BlockBlobBaseFee => "GLOBAL_METADATA_BLOCK_BLOB_BASE_FEE", Self::CreatedContractsLen => "GLOBAL_METADATA_CREATED_CONTRACTS_LEN", + Self::KernelHash => "GLOBAL_METADATA_KERNEL_HASH", + Self::KernelLen => "GLOBAL_METADATA_KERNEL_LEN", } } } diff --git a/evm/src/cpu/kernel/constants/journal_entry.rs b/evm/src/cpu/kernel/constants/journal_entry.rs index 8015ce2162..d84f2ade8f 100644 --- a/evm/src/cpu/kernel/constants/journal_entry.rs +++ b/evm/src/cpu/kernel/constants/journal_entry.rs @@ -1,4 +1,3 @@ -#[allow(dead_code)] #[derive(Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd, Debug)] pub(crate) enum JournalEntry { AccountLoaded = 0, @@ -17,7 +16,7 @@ pub(crate) enum JournalEntry { impl JournalEntry { pub(crate) const COUNT: usize = 11; - pub(crate) fn all() -> [Self; Self::COUNT] { + pub(crate) const fn all() -> [Self; Self::COUNT] { [ Self::AccountLoaded, Self::AccountDestroyed, @@ -34,7 +33,7 @@ impl JournalEntry { } /// The variable name that gets passed into kernel assembly code. 
- pub(crate) fn var_name(&self) -> &'static str { + pub(crate) const fn var_name(&self) -> &'static str { match self { Self::AccountLoaded => "JOURNAL_ENTRY_ACCOUNT_LOADED", Self::AccountDestroyed => "JOURNAL_ENTRY_ACCOUNT_DESTROYED", diff --git a/evm/src/cpu/kernel/constants/mod.rs b/evm/src/cpu/kernel/constants/mod.rs index 77abde994f..82c820f054 100644 --- a/evm/src/cpu/kernel/constants/mod.rs +++ b/evm/src/cpu/kernel/constants/mod.rs @@ -18,7 +18,7 @@ pub(crate) mod trie_type; pub(crate) mod txn_fields; /// Constants that are accessible to our kernel assembly code. -pub fn evm_constants() -> HashMap { +pub(crate) fn evm_constants() -> HashMap { let mut c = HashMap::new(); let hex_constants = MISC_CONSTANTS @@ -58,16 +58,19 @@ pub fn evm_constants() -> HashMap { c.insert(CALL_STACK_LIMIT.0.into(), U256::from(CALL_STACK_LIMIT.1)); for segment in Segment::all() { - c.insert(segment.var_name().into(), (segment as u32).into()); + c.insert(segment.var_name().into(), (segment as usize).into()); } for txn_field in NormalizedTxnField::all() { - c.insert(txn_field.var_name().into(), (txn_field as u32).into()); + // These offsets are already scaled by their respective segment. + c.insert(txn_field.var_name().into(), (txn_field as usize).into()); } for txn_field in GlobalMetadata::all() { - c.insert(txn_field.var_name().into(), (txn_field as u32).into()); + // These offsets are already scaled by their respective segment. + c.insert(txn_field.var_name().into(), (txn_field as usize).into()); } for txn_field in ContextMetadata::all() { - c.insert(txn_field.var_name().into(), (txn_field as u32).into()); + // These offsets are already scaled by their respective segment. 
+ c.insert(txn_field.var_name().into(), (txn_field as usize).into()); } for trie_type in PartialTrieType::all() { c.insert(trie_type.var_name().into(), (trie_type as u32).into()); @@ -86,12 +89,23 @@ pub fn evm_constants() -> HashMap { c } -const MISC_CONSTANTS: [(&str, [u8; 32]); 1] = [ +const MISC_CONSTANTS: [(&str, [u8; 32]); 3] = [ // Base for limbs used in bignum arithmetic. ( "BIGNUM_LIMB_BASE", hex!("0000000000000000000000000000000100000000000000000000000000000000"), ), + // Position in SEGMENT_RLP_RAW where the empty node encoding is stored. It is + // equal to u32::MAX + @SEGMENT_RLP_RAW so that all rlp pointers are much smaller than that. + ( + "ENCODED_EMPTY_NODE_POS", + hex!("0000000000000000000000000000000000000000000000000000000CFFFFFFFF"), + ), + // 0x10000 = 2^16 bytes, much larger than any RLP blob the EVM could possibly create. + ( + "MAX_RLP_BLOB_SIZE", + hex!("0000000000000000000000000000000000000000000000000000000000010000"), + ), ]; const HASH_CONSTANTS: [(&str, [u8; 32]); 2] = [ @@ -154,7 +168,7 @@ const EC_CONSTANTS: [(&str, [u8; 32]); 20] = [ ), ( "BN_BNEG_LOC", - // This just needs to be large enough to not interfere with anything else in SEGMENT_KERNEL_BN_TABLE_Q. + // This just needs to be large enough to not interfere with anything else in SEGMENT_BN_TABLE_Q. 
hex!("0000000000000000000000000000000000000000000000000000000000001337"), ), ( diff --git a/evm/src/cpu/kernel/constants/trie_type.rs b/evm/src/cpu/kernel/constants/trie_type.rs index 7f936529e5..fd89f41000 100644 --- a/evm/src/cpu/kernel/constants/trie_type.rs +++ b/evm/src/cpu/kernel/constants/trie_type.rs @@ -1,4 +1,4 @@ -use std::ops::Deref; +use core::ops::Deref; use eth_trie_utils::partial_trie::HashedPartialTrie; @@ -26,7 +26,7 @@ impl PartialTrieType { } } - pub(crate) fn all() -> [Self; Self::COUNT] { + pub(crate) const fn all() -> [Self; Self::COUNT] { [ Self::Empty, Self::Hash, @@ -37,7 +37,7 @@ impl PartialTrieType { } /// The variable name that gets passed into kernel assembly code. - pub(crate) fn var_name(&self) -> &'static str { + pub(crate) const fn var_name(&self) -> &'static str { match self { Self::Empty => "MPT_NODE_EMPTY", Self::Hash => "MPT_NODE_HASH", diff --git a/evm/src/cpu/kernel/constants/txn_fields.rs b/evm/src/cpu/kernel/constants/txn_fields.rs index f4364c6f07..0b74409b37 100644 --- a/evm/src/cpu/kernel/constants/txn_fields.rs +++ b/evm/src/cpu/kernel/constants/txn_fields.rs @@ -1,35 +1,47 @@ +use crate::memory::segments::Segment; + /// These are normalized transaction fields, i.e. not specific to any transaction type. +/// +/// Each value is directly scaled by the corresponding `Segment::TxnFields` value for faster +/// memory access in the kernel. #[allow(dead_code)] +#[allow(clippy::enum_clike_unportable_variant)] +#[repr(usize)] #[derive(Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd, Debug)] pub(crate) enum NormalizedTxnField { /// Whether a chain ID was present in the txn data. Type 0 transaction with v=27 or v=28 have /// no chain ID. This affects what fields get signed. 
- ChainIdPresent = 0, - ChainId = 1, - Nonce = 2, - MaxPriorityFeePerGas = 3, - MaxFeePerGas = 4, - GasLimit = 6, - IntrinsicGas = 7, - To = 8, - Value = 9, + ChainIdPresent = Segment::TxnFields as usize, + ChainId, + Nonce, + MaxPriorityFeePerGas, + MaxFeePerGas, + GasLimit, + IntrinsicGas, + To, + Value, /// The length of the data field. The data itself is stored in another segment. - DataLen = 10, - YParity = 11, - R = 12, - S = 13, - Origin = 14, + DataLen, + YParity, + R, + S, + Origin, /// The actual computed gas price for this transaction in the block. /// This is not technically a transaction field, as it depends on the block's base fee. - ComputedFeePerGas = 15, - ComputedPriorityFeePerGas = 16, + ComputedFeePerGas, + ComputedPriorityFeePerGas, } impl NormalizedTxnField { pub(crate) const COUNT: usize = 16; - pub(crate) fn all() -> [Self; Self::COUNT] { + /// Unscales this virtual offset by their respective `Segment` value. + pub(crate) const fn unscale(&self) -> usize { + *self as usize - Segment::TxnFields as usize + } + + pub(crate) const fn all() -> [Self; Self::COUNT] { [ Self::ChainIdPresent, Self::ChainId, @@ -51,7 +63,7 @@ impl NormalizedTxnField { } /// The variable name that gets passed into kernel assembly code. - pub(crate) fn var_name(&self) -> &'static str { + pub(crate) const fn var_name(&self) -> &'static str { match self { NormalizedTxnField::ChainIdPresent => "TXN_FIELD_CHAIN_ID_PRESENT", NormalizedTxnField::ChainId => "TXN_FIELD_CHAIN_ID", diff --git a/evm/src/cpu/kernel/cost_estimator.rs b/evm/src/cpu/kernel/cost_estimator.rs index ae8376479d..70cc726772 100644 --- a/evm/src/cpu/kernel/cost_estimator.rs +++ b/evm/src/cpu/kernel/cost_estimator.rs @@ -25,13 +25,12 @@ fn cost_estimate_item(item: &Item) -> u32 { } } -fn cost_estimate_standard_op(_op: &str) -> u32 { +const fn cost_estimate_standard_op(_op: &str) -> u32 { // For now we just treat any standard operation as having the same cost. 
This is pretty naive, // but should work fine with our current set of simple optimization rules. 1 } -fn cost_estimate_push(num_bytes: usize) -> u32 { - // TODO: Once PUSH is actually implemented, check if this needs to be revised. +const fn cost_estimate_push(num_bytes: usize) -> u32 { num_bytes as u32 } diff --git a/evm/src/cpu/kernel/evm_asm.pest b/evm/src/cpu/kernel/evm_asm.pest index 3243aecc56..40dec03b3e 100644 --- a/evm/src/cpu/kernel/evm_asm.pest +++ b/evm/src/cpu/kernel/evm_asm.pest @@ -34,7 +34,8 @@ local_label_decl = ${ identifier ~ ":" } macro_label_decl = ${ "%%" ~ identifier ~ ":" } macro_label = ${ "%%" ~ identifier } -bytes_item = { ^"BYTES " ~ literal ~ ("," ~ literal)* } +bytes_item = { ^"BYTES " ~ bytes_target ~ ("," ~ bytes_target)* } +bytes_target = { literal | constant } jumptable_item = { ^"JUMPTABLE " ~ identifier ~ ("," ~ identifier)* } push_instruction = { ^"PUSH " ~ push_target } push_target = { literal | identifier | macro_label | variable | constant } diff --git a/evm/src/cpu/kernel/interpreter.rs b/evm/src/cpu/kernel/interpreter.rs index e5ad9537bf..8d18639fca 100644 --- a/evm/src/cpu/kernel/interpreter.rs +++ b/evm/src/cpu/kernel/interpreter.rs @@ -1,24 +1,35 @@ //! An EVM interpreter for testing and debugging purposes. 
use core::cmp::Ordering; -use std::collections::HashMap; -use std::ops::Range; +use core::ops::Range; +use std::collections::{BTreeSet, HashMap}; -use anyhow::{anyhow, bail, ensure}; -use ethereum_types::{U256, U512}; +use anyhow::bail; +use eth_trie_utils::partial_trie::PartialTrie; +use ethereum_types::{BigEndianHash, H160, H256, U256, U512}; use keccak_hash::keccak; use plonky2::field::goldilocks_field::GoldilocksField; +use super::assembler::BYTES_PER_OFFSET; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::context_metadata::ContextMetadata; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cpu::kernel::constants::txn_fields::NormalizedTxnField; +use crate::cpu::stack::MAX_USER_STACK_SIZE; use crate::extension_tower::BN_BASE; +use crate::generation::mpt::load_all_mpts; use crate::generation::prover_input::ProverInputFn; -use crate::generation::state::GenerationState; +use crate::generation::rlp::all_rlp_prover_inputs_reversed; +use crate::generation::state::{all_withdrawals_prover_inputs_reversed, GenerationState}; use crate::generation::GenerationInputs; -use crate::memory::segments::Segment; +use crate::memory::segments::{Segment, SEGMENT_SCALING_FACTOR}; +use crate::util::{h2u, u256_to_usize}; +use crate::witness::errors::{ProgramError, ProverInputError}; +use crate::witness::gas::gas_to_charge; use crate::witness::memory::{MemoryAddress, MemoryContextState, MemorySegmentState, MemoryState}; +use crate::witness::operation::{Operation, CONTEXT_SCALING_FACTOR}; +use crate::witness::state::RegistersState; +use crate::witness::transition::decode; use crate::witness::util::stack_peek; type F = GoldilocksField; @@ -31,24 +42,45 @@ impl MemoryState { self.get(MemoryAddress::new(context, segment, offset)) } - fn mstore_general(&mut self, context: usize, segment: Segment, offset: usize, value: U256) { + fn mstore_general( + &mut self, + context: usize, + segment: Segment, + offset: usize, + value: U256, + ) 
-> InterpreterMemOpKind { + let old_value = self.mload_general(context, segment, offset); self.set(MemoryAddress::new(context, segment, offset), value); + InterpreterMemOpKind::Write(old_value, context, segment as usize, offset) } } -pub struct Interpreter<'a> { - kernel_mode: bool, - jumpdests: Vec, - pub(crate) context: usize, +pub(crate) struct Interpreter<'a> { pub(crate) generation_state: GenerationState, prover_inputs_map: &'a HashMap, pub(crate) halt_offsets: Vec, pub(crate) debug_offsets: Vec, running: bool, opcode_count: [usize; 0x100], + memops: Vec, +} + +/// Structure storing the state of the interpreter's registers. +struct InterpreterRegistersState { + kernel_mode: bool, + context: usize, + registers: RegistersState, +} + +/// Interpreter state at the last checkpoint: we only need to store +/// the state of the registers and the length of the vector of memory operations. +/// This data is enough to revert in case of an exception. +struct InterpreterCheckpoint { + registers: InterpreterRegistersState, + mem_len: usize, } -pub fn run_interpreter( +pub(crate) fn run_interpreter( initial_offset: usize, initial_stack: Vec, ) -> anyhow::Result> { @@ -61,14 +93,14 @@ pub fn run_interpreter( } #[derive(Clone)] -pub struct InterpreterMemoryInitialization { +pub(crate) struct InterpreterMemoryInitialization { pub label: String, pub stack: Vec, pub segment: Segment, pub memory: Vec<(usize, Vec)>, } -pub fn run_interpreter_with_memory( +pub(crate) fn run_interpreter_with_memory( memory_init: InterpreterMemoryInitialization, ) -> anyhow::Result> { let label = KERNEL.global_labels[&memory_init.label]; @@ -87,7 +119,7 @@ pub fn run_interpreter_with_memory( Ok(interpreter) } -pub fn run<'a>( +pub(crate) fn run<'a>( code: &'a [u8], initial_offset: usize, initial_stack: Vec, @@ -98,14 +130,38 @@ pub fn run<'a>( Ok(interpreter) } +/// Different types of Memory operations in the interpreter, and the data required to revert them. 
+enum InterpreterMemOpKind { + /// We need to provide the context. + Push(usize), + /// If we pop a certain value, we need to push it back to the correct context when reverting. + Pop(U256, usize), + /// If we write a value at a certain address, we need to write the old value back when reverting. + Write(U256, usize, usize, usize), +} + impl<'a> Interpreter<'a> { pub(crate) fn new_with_kernel(initial_offset: usize, initial_stack: Vec) -> Self { - Self::new( + let mut result = Self::new( &KERNEL.code, initial_offset, initial_stack, &KERNEL.prover_inputs, - ) + ); + result.initialize_rlp_segment(); + result + } + + /// Returns an instance of `Interpreter` given `GenerationInputs`, and assuming we are + /// initializing with the `KERNEL` code. + pub(crate) fn new_with_generation_inputs_and_kernel( + initial_offset: usize, + initial_stack: Vec, + inputs: GenerationInputs, + ) -> Self { + let mut result = Self::new_with_kernel(initial_offset, initial_stack); + result.initialize_interpreter_state_with_kernel(inputs); + result } pub(crate) fn new( @@ -115,15 +171,16 @@ impl<'a> Interpreter<'a> { prover_inputs: &'a HashMap, ) -> Self { let mut result = Self { - kernel_mode: true, - jumpdests: find_jumpdests(code), - generation_state: GenerationState::new(GenerationInputs::default(), code).unwrap(), + generation_state: GenerationState::new(GenerationInputs::default(), code) + .expect("Default inputs are known-good"), prover_inputs_map: prover_inputs, - context: 0, - halt_offsets: vec![DEFAULT_HALT_OFFSET], + // `DEFAULT_HALT_OFFSET` is used as a halting point for the interpreter, + // while the label `halt` is the halting label in the kernel. 
+ halt_offsets: vec![DEFAULT_HALT_OFFSET, KERNEL.global_labels["halt"]], debug_offsets: vec![], running: false, - opcode_count: [0; 0x100], + opcode_count: [0; 256], + memops: vec![], }; result.generation_state.registers.program_counter = initial_offset; let initial_stack_len = initial_stack.len(); @@ -137,10 +194,233 @@ impl<'a> Interpreter<'a> { result } + /// Initializes the interpreter state given `GenerationInputs`, using the KERNEL code. + pub(crate) fn initialize_interpreter_state_with_kernel(&mut self, inputs: GenerationInputs) { + self.initialize_interpreter_state(inputs, KERNEL.code_hash, KERNEL.code.len()); + } + + /// Initializes the interpreter state given `GenerationInputs`. + pub(crate) fn initialize_interpreter_state( + &mut self, + inputs: GenerationInputs, + kernel_hash: H256, + kernel_code_len: usize, + ) { + let tries = &inputs.tries; + + // Set state's inputs. + self.generation_state.inputs = inputs.clone(); + + // Initialize the MPT's pointers. + let (trie_root_ptrs, trie_data) = + load_all_mpts(tries).expect("Invalid MPT data for preinitialization"); + let trie_roots_after = &inputs.trie_roots_after; + self.generation_state.trie_root_ptrs = trie_root_ptrs; + + // Initialize the `TrieData` segment. + for (i, data) in trie_data.iter().enumerate() { + let trie_addr = MemoryAddress::new(0, Segment::TrieData, i); + self.generation_state.memory.set(trie_addr, data.into()); + } + + // Update the RLP and withdrawal prover inputs. + let rlp_prover_inputs = + all_rlp_prover_inputs_reversed(inputs.clone().signed_txn.as_ref().unwrap_or(&vec![])); + let withdrawal_prover_inputs = all_withdrawals_prover_inputs_reversed(&inputs.withdrawals); + self.generation_state.rlp_prover_inputs = rlp_prover_inputs; + self.generation_state.withdrawal_prover_inputs = withdrawal_prover_inputs; + + // Set `GlobalMetadata` values. 
+ let metadata = &inputs.block_metadata; + let global_metadata_to_set = [ + ( + GlobalMetadata::BlockBeneficiary, + U256::from_big_endian(&metadata.block_beneficiary.0), + ), + (GlobalMetadata::BlockTimestamp, metadata.block_timestamp), + (GlobalMetadata::BlockNumber, metadata.block_number), + (GlobalMetadata::BlockDifficulty, metadata.block_difficulty), + ( + GlobalMetadata::BlockRandom, + metadata.block_random.into_uint(), + ), + (GlobalMetadata::BlockGasLimit, metadata.block_gaslimit), + (GlobalMetadata::BlockChainId, metadata.block_chain_id), + (GlobalMetadata::BlockBaseFee, metadata.block_base_fee), + ( + GlobalMetadata::BlockCurrentHash, + h2u(inputs.block_hashes.cur_hash), + ), + (GlobalMetadata::BlockGasUsed, metadata.block_gas_used), + (GlobalMetadata::BlockGasUsedBefore, inputs.gas_used_before), + (GlobalMetadata::BlockGasUsedAfter, inputs.gas_used_after), + (GlobalMetadata::TxnNumberBefore, inputs.txn_number_before), + ( + GlobalMetadata::TxnNumberAfter, + inputs.txn_number_before + if inputs.signed_txn.is_some() { 1 } else { 0 }, + ), + ( + GlobalMetadata::StateTrieRootDigestBefore, + h2u(tries.state_trie.hash()), + ), + ( + GlobalMetadata::TransactionTrieRootDigestBefore, + h2u(tries.transactions_trie.hash()), + ), + ( + GlobalMetadata::ReceiptTrieRootDigestBefore, + h2u(tries.receipts_trie.hash()), + ), + ( + GlobalMetadata::StateTrieRootDigestAfter, + h2u(trie_roots_after.state_root), + ), + ( + GlobalMetadata::TransactionTrieRootDigestAfter, + h2u(trie_roots_after.transactions_root), + ), + ( + GlobalMetadata::ReceiptTrieRootDigestAfter, + h2u(trie_roots_after.receipts_root), + ), + (GlobalMetadata::KernelHash, h2u(kernel_hash)), + (GlobalMetadata::KernelLen, kernel_code_len.into()), + ]; + + self.set_global_metadata_multi_fields(&global_metadata_to_set); + + // Set final block bloom values. 
+ let final_block_bloom_fields = (0..8) + .map(|i| { + ( + MemoryAddress::new_u256s( + U256::zero(), + (Segment::GlobalBlockBloom.unscale()).into(), + i.into(), + ) + .unwrap(), + metadata.block_bloom[i], + ) + }) + .collect::>(); + + self.set_memory_multi_addresses(&final_block_bloom_fields); + + // Set previous block hash. + let block_hashes_fields = (0..256) + .map(|i| { + ( + MemoryAddress::new_u256s( + U256::zero(), + (Segment::BlockHashes.unscale()).into(), + i.into(), + ) + .unwrap(), + h2u(inputs.block_hashes.prev_hashes[i]), + ) + }) + .collect::>(); + + self.set_memory_multi_addresses(&block_hashes_fields); + } + + fn checkpoint(&self) -> InterpreterCheckpoint { + let registers = InterpreterRegistersState { + kernel_mode: self.is_kernel(), + context: self.context(), + registers: self.generation_state.registers, + }; + InterpreterCheckpoint { + registers, + mem_len: self.memops.len(), + } + } + + fn roll_memory_back(&mut self, len: usize) { + // We roll the memory back until `memops` reaches length `len`. 
+ debug_assert!(self.memops.len() >= len); + while self.memops.len() > len { + if let Some(op) = self.memops.pop() { + match op { + InterpreterMemOpKind::Push(context) => { + self.generation_state.memory.contexts[context].segments + [Segment::Stack.unscale()] + .content + .pop(); + } + InterpreterMemOpKind::Pop(value, context) => { + self.generation_state.memory.contexts[context].segments + [Segment::Stack.unscale()] + .content + .push(value) + } + InterpreterMemOpKind::Write(value, context, segment, offset) => { + self.generation_state.memory.contexts[context].segments + [segment >> SEGMENT_SCALING_FACTOR] // we need to unscale the segment value + .content[offset] = value + } + } + } + } + } + + fn rollback(&mut self, checkpoint: InterpreterCheckpoint) { + let InterpreterRegistersState { + kernel_mode, + context, + registers, + } = checkpoint.registers; + self.set_is_kernel(kernel_mode); + self.set_context(context); + self.generation_state.registers = registers; + self.roll_memory_back(checkpoint.mem_len); + } + + fn handle_error(&mut self, err: ProgramError) -> anyhow::Result<()> { + let exc_code: u8 = match err { + ProgramError::OutOfGas => 0, + ProgramError::InvalidOpcode => 1, + ProgramError::StackUnderflow => 2, + ProgramError::InvalidJumpDestination => 3, + ProgramError::InvalidJumpiDestination => 4, + ProgramError::StackOverflow => 5, + _ => bail!("TODO: figure out what to do with this..."), + }; + + self.run_exception(exc_code) + .map_err(|_| anyhow::Error::msg("error handling errored...")) + } + pub(crate) fn run(&mut self) -> anyhow::Result<()> { self.running = true; while self.running { - self.run_opcode()?; + let pc = self.generation_state.registers.program_counter; + if self.is_kernel() && self.halt_offsets.contains(&pc) { + return Ok(()); + }; + + let checkpoint = self.checkpoint(); + let result = self.run_opcode(); + match result { + Ok(()) => Ok(()), + Err(e) => { + if self.is_kernel() { + let offset_name = + 
KERNEL.offset_name(self.generation_state.registers.program_counter); + bail!( + "{:?} in kernel at pc={}, stack={:?}, memory={:?}", + e, + offset_name, + self.stack(), + self.generation_state.memory.contexts[0].segments + [Segment::KernelGeneral.unscale()] + .content, + ); + } + self.rollback(checkpoint); + self.handle_error(e) + } + }?; } println!("Opcode count:"); for i in 0..0x100 { @@ -153,7 +433,9 @@ impl<'a> Interpreter<'a> { } fn code(&self) -> &MemorySegmentState { - &self.generation_state.memory.contexts[self.context].segments[Segment::Code as usize] + // The context is 0 if we are in kernel mode. + &self.generation_state.memory.contexts[(1 - self.is_kernel() as usize) * self.context()] + .segments[Segment::Code.unscale()] } fn code_slice(&self, n: usize) -> Vec { @@ -165,45 +447,76 @@ impl<'a> Interpreter<'a> { } pub(crate) fn get_txn_field(&self, field: NormalizedTxnField) -> U256 { - self.generation_state.memory.contexts[0].segments[Segment::TxnFields as usize] - .get(field as usize) + // These fields are already scaled by their respective segment. + self.generation_state.memory.contexts[0].segments[Segment::TxnFields.unscale()] + .get(field.unscale()) } pub(crate) fn set_txn_field(&mut self, field: NormalizedTxnField, value: U256) { - self.generation_state.memory.contexts[0].segments[Segment::TxnFields as usize] - .set(field as usize, value); + // These fields are already scaled by their respective segment. + self.generation_state.memory.contexts[0].segments[Segment::TxnFields.unscale()] + .set(field.unscale(), value); } pub(crate) fn get_txn_data(&self) -> &[U256] { - &self.generation_state.memory.contexts[0].segments[Segment::TxnData as usize].content + &self.generation_state.memory.contexts[0].segments[Segment::TxnData.unscale()].content + } + + pub(crate) fn get_context_metadata_field(&self, ctx: usize, field: ContextMetadata) -> U256 { + // These fields are already scaled by their respective segment. 
+ self.generation_state.memory.contexts[ctx].segments[Segment::ContextMetadata.unscale()] + .get(field.unscale()) + } + + pub(crate) fn set_context_metadata_field( + &mut self, + ctx: usize, + field: ContextMetadata, + value: U256, + ) { + // These fields are already scaled by their respective segment. + self.generation_state.memory.contexts[ctx].segments[Segment::ContextMetadata.unscale()] + .set(field.unscale(), value) } pub(crate) fn get_global_metadata_field(&self, field: GlobalMetadata) -> U256 { - self.generation_state.memory.contexts[0].segments[Segment::GlobalMetadata as usize] - .get(field as usize) + // These fields are already scaled by their respective segment. + let field = field.unscale(); + self.generation_state.memory.contexts[0].segments[Segment::GlobalMetadata.unscale()] + .get(field) } pub(crate) fn set_global_metadata_field(&mut self, field: GlobalMetadata, value: U256) { - self.generation_state.memory.contexts[0].segments[Segment::GlobalMetadata as usize] - .set(field as usize, value) + // These fields are already scaled by their respective segment. 
+ let field = field.unscale(); + self.generation_state.memory.contexts[0].segments[Segment::GlobalMetadata.unscale()] + .set(field, value) + } + + pub(crate) fn set_global_metadata_multi_fields(&mut self, metadata: &[(GlobalMetadata, U256)]) { + for &(field, value) in metadata { + let field = field.unscale(); + self.generation_state.memory.contexts[0].segments[Segment::GlobalMetadata.unscale()] + .set(field, value); + } } pub(crate) fn get_trie_data(&self) -> &[U256] { - &self.generation_state.memory.contexts[0].segments[Segment::TrieData as usize].content + &self.generation_state.memory.contexts[0].segments[Segment::TrieData.unscale()].content } pub(crate) fn get_trie_data_mut(&mut self) -> &mut Vec { - &mut self.generation_state.memory.contexts[0].segments[Segment::TrieData as usize].content + &mut self.generation_state.memory.contexts[0].segments[Segment::TrieData.unscale()].content } pub(crate) fn get_memory_segment(&self, segment: Segment) -> Vec { - self.generation_state.memory.contexts[0].segments[segment as usize] + self.generation_state.memory.contexts[0].segments[segment.unscale()] .content .clone() } pub(crate) fn get_memory_segment_bytes(&self, segment: Segment) -> Vec { - self.generation_state.memory.contexts[0].segments[segment as usize] + self.generation_state.memory.contexts[0].segments[segment.unscale()] .content .iter() .map(|x| x.low_u32() as u8) @@ -211,10 +524,10 @@ impl<'a> Interpreter<'a> { } pub(crate) fn get_current_general_memory(&self) -> Vec { - self.generation_state.memory.contexts[self.context].segments - [Segment::KernelGeneral as usize] - .content - .clone() + self.generation_state.memory.contexts[self.context()].segments + [Segment::KernelGeneral.unscale()] + .content + .clone() } pub(crate) fn get_kernel_general_memory(&self) -> Vec { @@ -226,17 +539,17 @@ impl<'a> Interpreter<'a> { } pub(crate) fn set_current_general_memory(&mut self, memory: Vec) { - self.generation_state.memory.contexts[self.context].segments - 
[Segment::KernelGeneral as usize] + let context = self.context(); + self.generation_state.memory.contexts[context].segments[Segment::KernelGeneral.unscale()] .content = memory; } pub(crate) fn set_memory_segment(&mut self, segment: Segment, memory: Vec) { - self.generation_state.memory.contexts[0].segments[segment as usize].content = memory; + self.generation_state.memory.contexts[0].segments[segment.unscale()].content = memory; } pub(crate) fn set_memory_segment_bytes(&mut self, segment: Segment, memory: Vec) { - self.generation_state.memory.contexts[0].segments[segment as usize].content = + self.generation_state.memory.contexts[0].segments[segment.unscale()].content = memory.into_iter().map(U256::from).collect(); } @@ -252,39 +565,71 @@ impl<'a> Interpreter<'a> { .contexts .push(MemoryContextState::default()); } - self.generation_state.memory.contexts[context].segments[Segment::Code as usize].content = + self.generation_state.memory.set( + MemoryAddress::new( + context, + Segment::ContextMetadata, + ContextMetadata::CodeSize.unscale(), + ), + code.len().into(), + ); + self.generation_state.memory.contexts[context].segments[Segment::Code.unscale()].content = code.into_iter().map(U256::from).collect(); } + pub(crate) fn set_memory_multi_addresses(&mut self, addrs: &[(MemoryAddress, U256)]) { + for &(addr, val) in addrs { + self.generation_state.memory.set(addr, val); + } + } + pub(crate) fn get_jumpdest_bits(&self, context: usize) -> Vec { - self.generation_state.memory.contexts[context].segments[Segment::JumpdestBits as usize] + self.generation_state.memory.contexts[context].segments[Segment::JumpdestBits.unscale()] .content .iter() .map(|x| x.bit(0)) .collect() } - fn incr(&mut self, n: usize) { + pub(crate) fn set_jumpdest_analysis_inputs(&mut self, jumps: HashMap>) { + self.generation_state.set_jumpdest_analysis_inputs(jumps); + } + + pub(crate) fn incr(&mut self, n: usize) { self.generation_state.registers.program_counter += n; } pub(crate) fn stack(&self) -> 
Vec { - let mut stack = self.generation_state.memory.contexts[self.context].segments - [Segment::Stack as usize] - .content - .clone(); - if self.stack_len() > 0 { - stack.push(self.stack_top()); + match self.stack_len().cmp(&1) { + Ordering::Greater => { + let mut stack = self.generation_state.memory.contexts[self.context()].segments + [Segment::Stack.unscale()] + .content + .clone(); + stack.truncate(self.stack_len() - 1); + stack.push( + self.stack_top() + .expect("The stack is checked to be nonempty"), + ); + stack + } + Ordering::Equal => { + vec![self + .stack_top() + .expect("The stack is checked to be nonempty")] + } + Ordering::Less => { + vec![] + } } - stack } - fn stack_segment_mut(&mut self) -> &mut Vec { - &mut self.generation_state.memory.contexts[self.context].segments[Segment::Stack as usize] + let context = self.context(); + &mut self.generation_state.memory.contexts[context].segments[Segment::Stack.unscale()] .content } - pub fn extract_kernel_memory(self, segment: Segment, range: Range) -> Vec { + pub(crate) fn extract_kernel_memory(self, segment: Segment, range: Range) -> Vec { let mut output: Vec = vec![]; for i in range { let term = self @@ -296,147 +641,173 @@ impl<'a> Interpreter<'a> { output } - pub(crate) fn push(&mut self, x: U256) { + pub(crate) fn push(&mut self, x: U256) -> Result<(), ProgramError> { + if !self.is_kernel() && self.stack_len() >= MAX_USER_STACK_SIZE { + return Err(ProgramError::StackOverflow); + } if self.stack_len() > 0 { - let top = self.stack_top(); - self.stack_segment_mut().push(top); + let top = self + .stack_top() + .expect("The stack is checked to be nonempty"); + let cur_len = self.stack_len(); + let stack_addr = MemoryAddress::new(self.context(), Segment::Stack, cur_len - 1); + self.generation_state.memory.set(stack_addr, top); } self.generation_state.registers.stack_top = x; self.generation_state.registers.stack_len += 1; + self.memops.push(InterpreterMemOpKind::Push(self.context())); + + Ok(()) } - fn 
push_bool(&mut self, x: bool) { - self.push(if x { U256::one() } else { U256::zero() }); + fn push_bool(&mut self, x: bool) -> Result<(), ProgramError> { + self.push(if x { U256::one() } else { U256::zero() }) } - pub(crate) fn pop(&mut self) -> U256 { + pub(crate) fn pop(&mut self) -> Result { let result = stack_peek(&self.generation_state, 0); + + if let Ok(val) = result { + self.memops + .push(InterpreterMemOpKind::Pop(val, self.context())); + } if self.stack_len() > 1 { let top = stack_peek(&self.generation_state, 1).unwrap(); self.generation_state.registers.stack_top = top; } self.generation_state.registers.stack_len -= 1; - let new_len = self.stack_len(); - if new_len > 0 { - self.stack_segment_mut().truncate(new_len - 1); - } else { - self.stack_segment_mut().truncate(0); - } - result.expect("Empty stack") + + result } - fn run_opcode(&mut self) -> anyhow::Result<()> { + fn run_opcode(&mut self) -> Result<(), ProgramError> { let opcode = self .code() .get(self.generation_state.registers.program_counter) .byte(0); self.opcode_count[opcode as usize] += 1; self.incr(1); - match opcode { - 0x00 => self.run_stop(), // "STOP", - 0x01 => self.run_add(), // "ADD", - 0x02 => self.run_mul(), // "MUL", - 0x03 => self.run_sub(), // "SUB", - 0x04 => self.run_div(), // "DIV", - 0x05 => self.run_sdiv(), // "SDIV", - 0x06 => self.run_mod(), // "MOD", - 0x07 => self.run_smod(), // "SMOD", - 0x08 => self.run_addmod(), // "ADDMOD", - 0x09 => self.run_mulmod(), // "MULMOD", - 0x0a => self.run_exp(), // "EXP", - 0x0b => self.run_signextend(), // "SIGNEXTEND", - 0x0c => self.run_addfp254(), // "ADDFP254", - 0x0d => self.run_mulfp254(), // "MULFP254", - 0x0e => self.run_subfp254(), // "SUBFP254", - 0x0f => self.run_submod(), // "SUBMOD", - 0x10 => self.run_lt(), // "LT", - 0x11 => self.run_gt(), // "GT", - 0x12 => self.run_slt(), // "SLT", - 0x13 => self.run_sgt(), // "SGT", - 0x14 => self.run_eq(), // "EQ", - 0x15 => self.run_iszero(), // "ISZERO", - 0x16 => self.run_and(), // 
"AND", - 0x17 => self.run_or(), // "OR", - 0x18 => self.run_xor(), // "XOR", - 0x19 => self.run_not(), // "NOT", - 0x1a => self.run_byte(), // "BYTE", - 0x1b => self.run_shl(), // "SHL", - 0x1c => self.run_shr(), // "SHR", - 0x1d => self.run_sar(), // "SAR", - 0x20 => self.run_keccak256(), // "KECCAK256", - 0x21 => self.run_keccak_general(), // "KECCAK_GENERAL", - 0x30 => self.run_address(), // "ADDRESS", - 0x31 => todo!(), // "BALANCE", - 0x32 => self.run_origin(), // "ORIGIN", - 0x33 => self.run_caller(), // "CALLER", - 0x34 => self.run_callvalue(), // "CALLVALUE", - 0x35 => self.run_calldataload(), // "CALLDATALOAD", - 0x36 => self.run_calldatasize(), // "CALLDATASIZE", - 0x37 => self.run_calldatacopy(), // "CALLDATACOPY", - 0x38 => self.run_codesize(), // "CODESIZE", - 0x39 => self.run_codecopy(), // "CODECOPY", - 0x3a => self.run_gasprice(), // "GASPRICE", - 0x3b => todo!(), // "EXTCODESIZE", - 0x3c => todo!(), // "EXTCODECOPY", - 0x3d => self.run_returndatasize(), // "RETURNDATASIZE", - 0x3e => self.run_returndatacopy(), // "RETURNDATACOPY", - 0x3f => todo!(), // "EXTCODEHASH", - 0x40 => todo!(), // "BLOCKHASH", - 0x41 => self.run_coinbase(), // "COINBASE", - 0x42 => self.run_timestamp(), // "TIMESTAMP", - 0x43 => self.run_number(), // "NUMBER", - 0x44 => self.run_difficulty(), // "DIFFICULTY", - 0x45 => self.run_gaslimit(), // "GASLIMIT", - 0x46 => self.run_chainid(), // "CHAINID", - 0x48 => self.run_basefee(), // "BASEFEE", - 0x49 => self.run_prover_input()?, // "PROVER_INPUT", - 0x4a => self.run_blobbasefee(), // "BLOBBASEFEE", - 0x50 => self.run_pop(), // "POP", - 0x51 => self.run_mload(), // "MLOAD", - 0x52 => self.run_mstore(), // "MSTORE", - 0x53 => self.run_mstore8(), // "MSTORE8", - 0x54 => todo!(), // "SLOAD", - 0x55 => todo!(), // "SSTORE", - 0x56 => self.run_jump(), // "JUMP", - 0x57 => self.run_jumpi(), // "JUMPI", - 0x58 => self.run_pc(), // "PC", - 0x59 => self.run_msize(), // "MSIZE", - 0x5a => todo!(), // "GAS", - 0x5b => self.run_jumpdest(), 
// "JUMPDEST", - 0x5e => self.run_mcopy(), // "MCOPY", - x if (0x5f..0x80).contains(&x) => self.run_push(x - 0x5f), // "PUSH" - x if (0x80..0x90).contains(&x) => self.run_dup(x - 0x7f), // "DUP" - x if (0x90..0xa0).contains(&x) => self.run_swap(x - 0x8f)?, // "SWAP" - 0xa0 => todo!(), // "LOG0", - 0xa1 => todo!(), // "LOG1", - 0xa2 => todo!(), // "LOG2", - 0xa3 => todo!(), // "LOG3", - 0xa4 => todo!(), // "LOG4", - 0xa5 => bail!( - "Executed PANIC, stack={:?}, memory={:?}", - self.stack(), - self.get_kernel_general_memory() - ), // "PANIC", - 0xee => self.run_mstore_32bytes(), // "MSTORE_32BYTES", - 0xf0 => todo!(), // "CREATE", - 0xf1 => todo!(), // "CALL", - 0xf2 => todo!(), // "CALLCODE", - 0xf3 => todo!(), // "RETURN", - 0xf4 => todo!(), // "DELEGATECALL", - 0xf5 => todo!(), // "CREATE2", - 0xf6 => self.run_get_context(), // "GET_CONTEXT", - 0xf7 => self.run_set_context(), // "SET_CONTEXT", - 0xf8 => self.run_mload_32bytes(), // "MLOAD_32BYTES", - 0xf9 => todo!(), // "EXIT_KERNEL", - 0xfa => todo!(), // "STATICCALL", - 0xfb => self.run_mload_general(), // "MLOAD_GENERAL", - 0xfc => self.run_mstore_general(), // "MSTORE_GENERAL", - 0xfd => todo!(), // "REVERT", - 0xfe => bail!("Executed INVALID"), // "INVALID", - 0xff => todo!(), // "SELFDESTRUCT", - _ => bail!("Unrecognized opcode {}.", opcode), - }; + 0x00 => self.run_syscall(opcode, 0, false), // "STOP", + 0x01 => self.run_add(), // "ADD", + 0x02 => self.run_mul(), // "MUL", + 0x03 => self.run_sub(), // "SUB", + 0x04 => self.run_div(), // "DIV", + 0x05 => self.run_syscall(opcode, 2, false), // "SDIV", + 0x06 => self.run_mod(), // "MOD", + 0x07 => self.run_syscall(opcode, 2, false), // "SMOD", + 0x08 => self.run_addmod(), // "ADDMOD", + 0x09 => self.run_mulmod(), // "MULMOD", + 0x0a => self.run_syscall(opcode, 2, false), // "EXP", + 0x0b => self.run_syscall(opcode, 2, false), // "SIGNEXTEND", + 0x0c => self.run_addfp254(), // "ADDFP254", + 0x0d => self.run_mulfp254(), // "MULFP254", + 0x0e => 
self.run_subfp254(), // "SUBFP254", + 0x0f => self.run_submod(), // "SUBMOD", + 0x10 => self.run_lt(), // "LT", + 0x11 => self.run_gt(), // "GT", + 0x12 => self.run_syscall(opcode, 2, false), // "SLT", + 0x13 => self.run_syscall(opcode, 2, false), // "SGT", + 0x14 => self.run_eq(), // "EQ", + 0x15 => self.run_iszero(), // "ISZERO", + 0x16 => self.run_and(), // "AND", + 0x17 => self.run_or(), // "OR", + 0x18 => self.run_xor(), // "XOR", + 0x19 => self.run_not(), // "NOT", + 0x1a => self.run_byte(), // "BYTE", + 0x1b => self.run_shl(), // "SHL", + 0x1c => self.run_shr(), // "SHR", + 0x1d => self.run_syscall(opcode, 2, false), // "SAR", + 0x20 => self.run_syscall(opcode, 2, false), // "KECCAK256", + 0x21 => self.run_keccak_general(), // "KECCAK_GENERAL", + 0x30 => self.run_syscall(opcode, 0, true), // "ADDRESS", + 0x31 => self.run_syscall(opcode, 1, false), // "BALANCE", + 0x32 => self.run_syscall(opcode, 0, true), // "ORIGIN", + 0x33 => self.run_syscall(opcode, 0, true), // "CALLER", + 0x34 => self.run_syscall(opcode, 0, true), // "CALLVALUE", + 0x35 => self.run_syscall(opcode, 1, false), // "CALLDATALOAD", + 0x36 => self.run_syscall(opcode, 0, true), // "CALLDATASIZE", + 0x37 => self.run_syscall(opcode, 3, false), // "CALLDATACOPY", + 0x38 => self.run_syscall(opcode, 0, true), // "CODESIZE", + 0x39 => self.run_syscall(opcode, 3, false), // "CODECOPY", + 0x3a => self.run_syscall(opcode, 0, true), // "GASPRICE", + 0x3b => self.run_syscall(opcode, 1, false), // "EXTCODESIZE", + 0x3c => self.run_syscall(opcode, 4, false), // "EXTCODECOPY", + 0x3d => self.run_syscall(opcode, 0, true), // "RETURNDATASIZE", + 0x3e => self.run_syscall(opcode, 3, false), // "RETURNDATACOPY", + 0x3f => self.run_syscall(opcode, 1, false), // "EXTCODEHASH", + 0x40 => self.run_syscall(opcode, 1, false), // "BLOCKHASH", + 0x41 => self.run_syscall(opcode, 0, true), // "COINBASE", + 0x42 => self.run_syscall(opcode, 0, true), // "TIMESTAMP", + 0x43 => self.run_syscall(opcode, 0, true), // "NUMBER", 
+ 0x44 => self.run_syscall(opcode, 0, true), // "DIFFICULTY", + 0x45 => self.run_syscall(opcode, 0, true), // "GASLIMIT", + 0x46 => self.run_syscall(opcode, 0, true), // "CHAINID", + 0x47 => self.run_syscall(opcode, 0, true), // "SELFBALANCE", + 0x48 => self.run_syscall(opcode, 0, true), // "BASEFEE", + 0x49 => self.run_prover_input(), // "PROVER_INPUT", + 0x4a => self.run_syscall(opcode, 0, true), // "BLOBBASEFEE", + 0x50 => self.run_pop(), // "POP", + 0x51 => self.run_syscall(opcode, 1, false), // "MLOAD", + 0x52 => self.run_syscall(opcode, 2, false), // "MSTORE", + 0x53 => self.run_syscall(opcode, 2, false), // "MSTORE8", + 0x54 => self.run_syscall(opcode, 1, false), // "SLOAD", + 0x55 => self.run_syscall(opcode, 2, false), // "SSTORE", + 0x56 => self.run_jump(), // "JUMP", + 0x57 => self.run_jumpi(), // "JUMPI", + 0x58 => self.run_pc(), // "PC", + 0x59 => self.run_syscall(opcode, 0, true), // "MSIZE", + 0x5a => self.run_syscall(opcode, 0, true), // "GAS", + 0x5b => self.run_jumpdest(), // "JUMPDEST", + 0x5e => self.run_syscall(opcode, 3, false), // "MCOPY", + x if (0x5f..0x80).contains(&x) => self.run_push(x - 0x5f), // "PUSH" + x if (0x80..0x90).contains(&x) => self.run_dup(x - 0x7f), // "DUP" + x if (0x90..0xa0).contains(&x) => self.run_swap(x - 0x8f), // "SWAP" + 0xa0 => self.run_syscall(opcode, 2, false), // "LOG0", + 0xa1 => self.run_syscall(opcode, 3, false), // "LOG1", + 0xa2 => self.run_syscall(opcode, 4, false), // "LOG2", + 0xa3 => self.run_syscall(opcode, 5, false), // "LOG3", + 0xa4 => self.run_syscall(opcode, 6, false), // "LOG4", + 0xa5 => { + log::warn!( + "Kernel panic at {}, stack = {:?}, memory = {:?}", + KERNEL.offset_name(self.generation_state.registers.program_counter), + self.stack(), + self.get_kernel_general_memory() + ); + Err(ProgramError::KernelPanic) + } // "PANIC", + x if (0xc0..0xe0).contains(&x) => self.run_mstore_32bytes(x - 0xc0 + 1), // "MSTORE_32BYTES", + 0xf0 => self.run_syscall(opcode, 3, false), // "CREATE", + 0xf1 => 
self.run_syscall(opcode, 7, false), // "CALL", + 0xf2 => self.run_syscall(opcode, 7, false), // "CALLCODE", + 0xf3 => self.run_syscall(opcode, 2, false), // "RETURN", + 0xf4 => self.run_syscall(opcode, 6, false), // "DELEGATECALL", + 0xf5 => self.run_syscall(opcode, 4, false), // "CREATE2", + 0xf6 => self.run_get_context(), // "GET_CONTEXT", + 0xf7 => self.run_set_context(), // "SET_CONTEXT", + 0xf8 => self.run_mload_32bytes(), // "MLOAD_32BYTES", + 0xf9 => self.run_exit_kernel(), // "EXIT_KERNEL", + 0xfa => self.run_syscall(opcode, 6, false), // "STATICCALL", + 0xfb => self.run_mload_general(), // "MLOAD_GENERAL", + 0xfc => self.run_mstore_general(), // "MSTORE_GENERAL", + 0xfd => self.run_syscall(opcode, 2, false), // "REVERT", + 0xfe => { + log::warn!( + "Invalid opcode at {}", + KERNEL.offset_name(self.generation_state.registers.program_counter), + ); + Err(ProgramError::InvalidOpcode) + } // "INVALID", + 0xff => self.run_syscall(opcode, 1, false), // "SELFDESTRUCT", + _ => { + log::warn!( + "Unrecognized opcode at {}", + KERNEL.offset_name(self.generation_state.registers.program_counter), + ); + Err(ProgramError::InvalidOpcode) + } + }?; if self .debug_offsets @@ -447,6 +818,24 @@ impl<'a> Interpreter<'a> { println!("At {label}"); } + let op = decode(self.generation_state.registers, opcode) + // We default to prover inputs, as those are kernel-only instructions that charge nothing. + .unwrap_or(Operation::ProverInput); + self.generation_state.registers.gas_used += gas_to_charge(op); + + if !self.is_kernel() { + let gas_limit_address = MemoryAddress { + context: self.context(), + segment: Segment::ContextMetadata.unscale(), + virt: ContextMetadata::GasLimit.unscale(), + }; + let gas_limit = + u256_to_usize(self.generation_state.memory.get(gas_limit_address))? 
as u64; + if self.generation_state.registers.gas_used > gas_limit { + return Err(ProgramError::OutOfGas); + } + } + Ok(()) } @@ -458,318 +847,175 @@ impl<'a> Interpreter<'a> { KERNEL.offset_label(self.generation_state.registers.program_counter) } - fn run_stop(&mut self) { - self.running = false; - } - - fn run_add(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(x.overflowing_add(y).0); + fn run_add(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push(x.overflowing_add(y).0) } - fn run_mul(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(x.overflowing_mul(y).0); + fn run_mul(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push(x.overflowing_mul(y).0) } - fn run_sub(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(x.overflowing_sub(y).0); + fn run_sub(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push(x.overflowing_sub(y).0) } - fn run_addfp254(&mut self) { - let x = self.pop() % BN_BASE; - let y = self.pop() % BN_BASE; + fn run_addfp254(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()? % BN_BASE; + let y = self.pop()? % BN_BASE; // BN_BASE is 254-bit so addition can't overflow - self.push((x + y) % BN_BASE); + self.push((x + y) % BN_BASE) } - fn run_mulfp254(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(U256::try_from(x.full_mul(y) % BN_BASE).unwrap()); + fn run_mulfp254(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push( + U256::try_from(x.full_mul(y) % BN_BASE) + .expect("BN_BASE is 254 bit so the U512 fits in a U256"), + ) } - fn run_subfp254(&mut self) { - let x = self.pop() % BN_BASE; - let y = self.pop() % BN_BASE; + fn run_subfp254(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()? % BN_BASE; + let y = self.pop()? 
% BN_BASE; // BN_BASE is 254-bit so addition can't overflow - self.push((x + (BN_BASE - y)) % BN_BASE); - } - - fn run_div(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(if y.is_zero() { U256::zero() } else { x / y }); - } - - fn run_sdiv(&mut self) { - let mut x = self.pop(); - let mut y = self.pop(); - - let y_is_zero = y.is_zero(); - - if y_is_zero { - self.push(U256::zero()); - } else if y.eq(&MINUS_ONE) && x.eq(&MIN_VALUE) { - self.push(MIN_VALUE); - } else { - let x_is_pos = x.eq(&(x & SIGN_MASK)); - let y_is_pos = y.eq(&(y & SIGN_MASK)); - - // We compute the absolute quotient first, - // then adapt its sign based on the operands. - if !x_is_pos { - x = two_complement(x); - } - if !y_is_pos { - y = two_complement(y); - } - let div = x / y; - if div.eq(&U256::zero()) { - self.push(U256::zero()); - } - - self.push(if x_is_pos == y_is_pos { - div - } else { - two_complement(div) - }); - } + self.push((x + (BN_BASE - y)) % BN_BASE) } - fn run_mod(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(if y.is_zero() { U256::zero() } else { x % y }); + fn run_div(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push(if y.is_zero() { U256::zero() } else { x / y }) } - fn run_smod(&mut self) { - let mut x = self.pop(); - let mut y = self.pop(); - - if y.is_zero() { - self.push(U256::zero()); - } else { - let x_is_pos = x.eq(&(x & SIGN_MASK)); - let y_is_pos = y.eq(&(y & SIGN_MASK)); - - // We compute the absolute remainder first, - // then adapt its sign based on the operands. - if !x_is_pos { - x = two_complement(x); - } - if !y_is_pos { - y = two_complement(y); - } - let rem = x % y; - if rem.eq(&U256::zero()) { - self.push(U256::zero()); - } - - // Remainder always has the same sign as the dividend. 
- self.push(if x_is_pos { rem } else { two_complement(rem) }); - } + fn run_mod(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push(if y.is_zero() { U256::zero() } else { x % y }) } - fn run_addmod(&mut self) { - let x = U512::from(self.pop()); - let y = U512::from(self.pop()); - let z = U512::from(self.pop()); + fn run_addmod(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + let z = self.pop()?; self.push(if z.is_zero() { - U256::zero() + z } else { - U256::try_from((x + y) % z).unwrap() - }); + let (x, y, z) = (U512::from(x), U512::from(y), U512::from(z)); + U256::try_from((x + y) % z) + .expect("Inputs are U256 and their sum mod a U256 fits in a U256.") + }) } - fn run_submod(&mut self) { - let x = U512::from(self.pop()); - let y = U512::from(self.pop()); - let z = U512::from(self.pop()); + fn run_submod(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + let z = self.pop()?; self.push(if z.is_zero() { - U256::zero() + z } else { - U256::try_from((z + x - y) % z).unwrap() - }); + let (x, y, z) = (U512::from(x), U512::from(y), U512::from(z)); + U256::try_from((z + x - y) % z) + .expect("Inputs are U256 and their difference mod a U256 fits in a U256.") + }) } - fn run_mulmod(&mut self) { - let x = self.pop(); - let y = self.pop(); - let z = U512::from(self.pop()); + fn run_mulmod(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + let z = self.pop()?; self.push(if z.is_zero() { - U256::zero() + z } else { - U256::try_from(x.full_mul(y) % z).unwrap() - }); - } - - fn run_exp(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(x.overflowing_pow(y).0); - } - - fn run_lt(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push_bool(x < y); + U256::try_from(x.full_mul(y) % z) + .expect("Inputs are U256 and their product mod a U256 fits in a U256.") + 
}) } - fn run_gt(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push_bool(x > y); + fn run_lt(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push_bool(x < y) } - fn run_slt(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push_bool(signed_cmp(x, y) == Ordering::Less); - } - - fn run_sgt(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push_bool(signed_cmp(x, y) == Ordering::Greater); - } - - fn run_signextend(&mut self) { - let n = self.pop(); - let x = self.pop(); - if n > U256::from(31) { - self.push(x); - } else { - let n = n.low_u64() as usize; - let num_bytes_prepend = 31 - n; - - let mut x_bytes = [0u8; 32]; - x.to_big_endian(&mut x_bytes); - let x_bytes = x_bytes[num_bytes_prepend..].to_vec(); - let sign_bit = x_bytes[0] >> 7; - - let mut bytes = if sign_bit == 0 { - vec![0; num_bytes_prepend] - } else { - vec![0xff; num_bytes_prepend] - }; - bytes.extend_from_slice(&x_bytes); - - self.push(U256::from_big_endian(&bytes)); - } + fn run_gt(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push_bool(x > y) } - fn run_eq(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push_bool(x == y); + fn run_eq(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push_bool(x == y) } - fn run_iszero(&mut self) { - let x = self.pop(); - self.push_bool(x.is_zero()); + fn run_iszero(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + self.push_bool(x.is_zero()) } - fn run_and(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(x & y); + fn run_and(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push(x & y) } - fn run_or(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(x | y); + fn run_or(&mut self) -> anyhow::Result<(), ProgramError> { + let x = 
self.pop()?; + let y = self.pop()?; + self.push(x | y) } - fn run_xor(&mut self) { - let x = self.pop(); - let y = self.pop(); - self.push(x ^ y); + fn run_xor(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let y = self.pop()?; + self.push(x ^ y) } - fn run_not(&mut self) { - let x = self.pop(); - self.push(!x); + fn run_not(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + self.push(!x) } - fn run_byte(&mut self) { - let i = self.pop(); - let x = self.pop(); + fn run_byte(&mut self) -> anyhow::Result<(), ProgramError> { + let i = self.pop()?; + let x = self.pop()?; let result = if i < 32.into() { x.byte(31 - i.as_usize()) } else { 0 }; - self.push(result.into()); + self.push(result.into()) } - fn run_shl(&mut self) { - let shift = self.pop(); - let value = self.pop(); + fn run_shl(&mut self) -> anyhow::Result<(), ProgramError> { + let shift = self.pop()?; + let value = self.pop()?; self.push(if shift < U256::from(256usize) { value << shift } else { U256::zero() - }); + }) } - fn run_shr(&mut self) { - let shift = self.pop(); - let value = self.pop(); - self.push(value >> shift); + fn run_shr(&mut self) -> anyhow::Result<(), ProgramError> { + let shift = self.pop()?; + let value = self.pop()?; + self.push(value >> shift) } - fn run_sar(&mut self) { - let shift = self.pop(); - let value = self.pop(); - let value_is_neg = !value.eq(&(value & SIGN_MASK)); - - if shift < U256::from(256usize) { - let shift = shift.low_u64() as usize; - let mask = !(MINUS_ONE >> shift); - let value_shifted = value >> shift; - - if value_is_neg { - self.push(value_shifted | mask); - } else { - self.push(value_shifted); - }; - } else { - self.push(if value_is_neg { - MINUS_ONE - } else { - U256::zero() - }); - } - } + fn run_keccak_general(&mut self) -> anyhow::Result<(), ProgramError> { + let addr = self.pop()?; + let (context, segment, offset) = unpack_address!(addr); - fn run_keccak256(&mut self) { - let offset = self.pop().as_usize(); 
- let size = self.pop().as_usize(); - let bytes = (offset..offset + size) - .map(|i| { - self.generation_state - .memory - .mload_general(self.context, Segment::MainMemory, i) - .byte(0) - }) - .collect::>(); - let hash = keccak(bytes); - self.push(U256::from_big_endian(hash.as_bytes())); - } - - fn run_keccak_general(&mut self) { - let context = self.pop().as_usize(); - let segment = Segment::all()[self.pop().as_usize()]; - // Not strictly needed but here to avoid surprises with MSIZE. - assert_ne!(segment, Segment::MainMemory, "Call KECCAK256 instead."); - let offset = self.pop().as_usize(); - let size = self.pop().as_usize(); + let size = self.pop()?.as_usize(); let bytes = (offset..offset + size) .map(|i| { self.generation_state @@ -780,264 +1026,143 @@ impl<'a> Interpreter<'a> { .collect::>(); println!("Hashing {:?}", &bytes); let hash = keccak(bytes); - self.push(U256::from_big_endian(hash.as_bytes())); + self.push(U256::from_big_endian(hash.as_bytes())) } - fn run_address(&mut self) { - self.push( - self.generation_state.memory.contexts[self.context].segments - [Segment::ContextMetadata as usize] - .get(ContextMetadata::Address as usize), - ) - } - - fn run_origin(&mut self) { - self.push(self.get_txn_field(NormalizedTxnField::Origin)) - } - - fn run_caller(&mut self) { - self.push( - self.generation_state.memory.contexts[self.context].segments - [Segment::ContextMetadata as usize] - .get(ContextMetadata::Caller as usize), - ) - } + fn run_prover_input(&mut self) -> Result<(), ProgramError> { + let prover_input_fn = self + .prover_inputs_map + .get(&(self.generation_state.registers.program_counter - 1)) + .ok_or(ProgramError::ProverInputError( + ProverInputError::InvalidMptInput, + ))?; + let output = self.generation_state.prover_input(prover_input_fn)?; + self.push(output) + } + + fn run_pop(&mut self) -> anyhow::Result<(), ProgramError> { + self.pop().map(|_| ()) + } + + fn run_syscall( + &mut self, + opcode: u8, + stack_values_read: usize, + 
stack_len_increased: bool, + ) -> Result<(), ProgramError> { + TryInto::::try_into(self.generation_state.registers.gas_used) + .map_err(|_| ProgramError::GasLimitError)?; + if self.generation_state.registers.stack_len < stack_values_read { + return Err(ProgramError::StackUnderflow); + } - fn run_callvalue(&mut self) { - self.push( - self.generation_state.memory.contexts[self.context].segments - [Segment::ContextMetadata as usize] - .get(ContextMetadata::CallValue as usize), - ) - } + if stack_len_increased + && !self.is_kernel() + && self.generation_state.registers.stack_len >= MAX_USER_STACK_SIZE + { + return Err(ProgramError::StackOverflow); + }; - fn run_calldataload(&mut self) { - let offset = self.pop().as_usize(); - let value = U256::from_big_endian( - &(0..32) - .map(|i| { - self.generation_state - .memory - .mload_general(self.context, Segment::Calldata, offset + i) - .byte(0) - }) - .collect::>(), - ); - self.push(value); - } + let handler_jumptable_addr = KERNEL.global_labels["syscall_jumptable"]; + let handler_addr = { + let offset = handler_jumptable_addr + (opcode as usize) * (BYTES_PER_OFFSET as usize); + self.get_memory_segment(Segment::Code)[offset..offset + 3] + .iter() + .fold(U256::from(0), |acc, &elt| acc * (1 << 8) + elt) + }; - fn run_calldatasize(&mut self) { - self.push( - self.generation_state.memory.contexts[self.context].segments - [Segment::ContextMetadata as usize] - .get(ContextMetadata::CalldataSize as usize), - ) - } + let new_program_counter = + u256_to_usize(handler_addr).map_err(|_| ProgramError::IntegerTooLarge)?; - fn run_calldatacopy(&mut self) { - let dest_offset = self.pop().as_usize(); - let offset = self.pop().as_usize(); - let size = self.pop().as_usize(); - for i in 0..size { - let calldata_byte = self.generation_state.memory.mload_general( - self.context, - Segment::Calldata, - offset + i, - ); - self.generation_state.memory.mstore_general( - self.context, - Segment::MainMemory, - dest_offset + i, - calldata_byte, - ); - 
} - } + let syscall_info = U256::from(self.generation_state.registers.program_counter) + + U256::from((self.is_kernel() as usize) << 32) + + (U256::from(self.generation_state.registers.gas_used) << 192); + self.generation_state.registers.program_counter = new_program_counter; - fn run_codesize(&mut self) { - self.push( - self.generation_state.memory.contexts[self.context].segments - [Segment::ContextMetadata as usize] - .get(ContextMetadata::CodeSize as usize), - ) + self.set_is_kernel(true); + self.generation_state.registers.gas_used = 0; + self.push(syscall_info) } - fn run_codecopy(&mut self) { - let dest_offset = self.pop().as_usize(); - let offset = self.pop().as_usize(); - let size = self.pop().as_usize(); - for i in 0..size { - let code_byte = - self.generation_state - .memory - .mload_general(self.context, Segment::Code, offset + i); - self.generation_state.memory.mstore_general( - self.context, - Segment::MainMemory, - dest_offset + i, - code_byte, - ); + fn set_jumpdest_bit(&mut self, x: U256) -> U256 { + if self.generation_state.memory.contexts[self.context()].segments + [Segment::JumpdestBits.unscale()] + .content + .len() + > x.low_u32() as usize + { + self.generation_state.memory.get(MemoryAddress { + context: self.context(), + segment: Segment::JumpdestBits.unscale(), + virt: x.low_u32() as usize, + }) + } else { + 0.into() } } + fn run_jump(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; - fn run_gasprice(&mut self) { - self.push(self.get_txn_field(NormalizedTxnField::ComputedFeePerGas)) - } + let jumpdest_bit = self.set_jumpdest_bit(x); - fn run_returndatasize(&mut self) { - self.push( - self.generation_state.memory.contexts[self.context].segments - [Segment::ContextMetadata as usize] - .get(ContextMetadata::ReturndataSize as usize), - ) - } + // Check that the destination is valid. 
+ let x: u32 = x + .try_into() + .map_err(|_| ProgramError::InvalidJumpDestination)?; - fn run_returndatacopy(&mut self) { - let dest_offset = self.pop().as_usize(); - let offset = self.pop().as_usize(); - let size = self.pop().as_usize(); - for i in 0..size { - let returndata_byte = self.generation_state.memory.mload_general( - self.context, - Segment::Returndata, - offset + i, - ); - self.generation_state.memory.mstore_general( - self.context, - Segment::MainMemory, - dest_offset + i, - returndata_byte, - ); + if !self.is_kernel() && jumpdest_bit != U256::one() { + return Err(ProgramError::InvalidJumpDestination); } - } - - fn run_coinbase(&mut self) { - self.push(self.get_global_metadata_field(GlobalMetadata::BlockBeneficiary)) - } - - fn run_timestamp(&mut self) { - self.push(self.get_global_metadata_field(GlobalMetadata::BlockTimestamp)) - } - - fn run_number(&mut self) { - self.push(self.get_global_metadata_field(GlobalMetadata::BlockNumber)) - } - - fn run_difficulty(&mut self) { - self.push(self.get_global_metadata_field(GlobalMetadata::BlockDifficulty)) - } - fn run_gaslimit(&mut self) { - self.push(self.get_global_metadata_field(GlobalMetadata::BlockGasLimit)) + self.jump_to(x as usize, false) } - fn run_basefee(&mut self) { - self.push(self.get_global_metadata_field(GlobalMetadata::BlockBaseFee)) - } - - fn run_chainid(&mut self) { - self.push(self.get_global_metadata_field(GlobalMetadata::BlockChainId)) - } - - fn run_prover_input(&mut self) -> anyhow::Result<()> { - let prover_input_fn = self - .prover_inputs_map - .get(&(self.generation_state.registers.program_counter - 1)) - .ok_or_else(|| anyhow!("Offset not in prover inputs."))?; - let output = self - .generation_state - .prover_input(prover_input_fn) - .map_err(|_| anyhow!("Invalid prover inputs."))?; - self.push(output); - Ok(()) - } - - fn run_blobbasefee(&mut self) { - self.push(self.get_global_metadata_field(GlobalMetadata::BlockBlobBaseFee)) - } - - fn run_pop(&mut self) { - self.pop(); - } - 
- fn run_mload(&mut self) { - let offset = self.pop().as_usize(); - let value = U256::from_big_endian( - &(0..32) - .map(|i| { - self.generation_state - .memory - .mload_general(self.context, Segment::MainMemory, offset + i) - .byte(0) - }) - .collect::>(), - ); - self.push(value); - } - - fn run_mstore(&mut self) { - let offset = self.pop().as_usize(); - let value = self.pop(); - let mut bytes = [0; 32]; - value.to_big_endian(&mut bytes); - for (i, byte) in (0..32).zip(bytes) { - self.generation_state.memory.mstore_general( - self.context, - Segment::MainMemory, - offset + i, - byte.into(), - ); + fn run_jumpi(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let b = self.pop()?; + if !b.is_zero() { + let x: u32 = x + .try_into() + .map_err(|_| ProgramError::InvalidJumpiDestination)?; + self.jump_to(x as usize, true)?; } - } - - fn run_mstore8(&mut self) { - let offset = self.pop().as_usize(); - let value = self.pop(); - self.generation_state.memory.mstore_general( - self.context, - Segment::MainMemory, - offset, - value.byte(0).into(), - ); - } + let jumpdest_bit = self.set_jumpdest_bit(x); - fn run_jump(&mut self) { - let x = self.pop().as_usize(); - self.jump_to(x); - } - - fn run_jumpi(&mut self) { - let x = self.pop().as_usize(); - let b = self.pop(); - if !b.is_zero() { - self.jump_to(x); + if !b.is_zero() && !self.is_kernel() && jumpdest_bit != U256::one() { + return Err(ProgramError::InvalidJumpiDestination); } + Ok(()) } - fn run_pc(&mut self) { - self.push((self.generation_state.registers.program_counter - 1).into()); + fn run_blobbasefee(&mut self) -> anyhow::Result<(), ProgramError> { + self.push(self.get_global_metadata_field(GlobalMetadata::BlockBlobBaseFee)) } - fn run_msize(&mut self) { + fn run_pc(&mut self) -> anyhow::Result<(), ProgramError> { self.push( - self.generation_state.memory.contexts[self.context].segments - [Segment::ContextMetadata as usize] - .get(ContextMetadata::MemWords as usize), + (self + 
.generation_state + .registers + .program_counter + .saturating_sub(1)) + .into(), ) } - fn run_jumpdest(&mut self) { - assert!(!self.kernel_mode, "JUMPDEST is not needed in kernel code"); + fn run_jumpdest(&mut self) -> anyhow::Result<(), ProgramError> { + assert!(!self.is_kernel(), "JUMPDEST is not needed in kernel code"); + Ok(()) } - fn run_mcopy(&mut self) { - let dest_offset = self.pop().as_usize(); - let offset = self.pop().as_usize(); - let size = self.pop().as_usize(); + fn run_mcopy(&mut self) -> anyhow::Result<(), ProgramError> { + let dest_offset = self.pop()?.as_usize(); + let offset = self.pop()?.as_usize(); + let size = self.pop()?.as_usize(); let intermediary_memory: Vec = (0..size) .map(|i| { self.generation_state.memory.mload_general( - self.context, + self.context(), Segment::MainMemory, offset + i, ) @@ -1046,76 +1171,121 @@ impl<'a> Interpreter<'a> { for i in 0..size { self.generation_state.memory.mstore_general( - self.context, + self.context(), Segment::MainMemory, dest_offset + i, intermediary_memory[i], ); } - } - fn jump_to(&mut self, offset: usize) { - // The JUMPDEST rule is not enforced in kernel mode. 
- if !self.kernel_mode && self.jumpdests.binary_search(&offset).is_err() { - panic!("Destination is not a JUMPDEST."); - } + Ok(()) + } + fn jump_to(&mut self, offset: usize, is_jumpi: bool) -> anyhow::Result<(), ProgramError> { self.generation_state.registers.program_counter = offset; + if offset == KERNEL.global_labels["observe_new_address"] { + let tip_u256 = stack_peek(&self.generation_state, 0)?; + let tip_h256 = H256::from_uint(&tip_u256); + let tip_h160 = H160::from(tip_h256); + self.generation_state.observe_address(tip_h160); + } else if offset == KERNEL.global_labels["observe_new_contract"] { + let tip_u256 = stack_peek(&self.generation_state, 0)?; + let tip_h256 = H256::from_uint(&tip_u256); + self.generation_state.observe_contract(tip_h256)?; + } + if self.halt_offsets.contains(&offset) { self.running = false; } + Ok(()) } - fn run_push(&mut self, num_bytes: u8) { + fn run_push(&mut self, num_bytes: u8) -> anyhow::Result<(), ProgramError> { let x = U256::from_big_endian(&self.code_slice(num_bytes as usize)); self.incr(num_bytes as usize); - self.push(x); + self.push(x) } - fn run_dup(&mut self, n: u8) { - if n == 0 { - self.push(self.stack_top()); - } else { - self.push(stack_peek(&self.generation_state, n as usize - 1).unwrap()); + fn run_dup(&mut self, n: u8) -> anyhow::Result<(), ProgramError> { + let len = self.stack_len(); + if !self.is_kernel() && len >= MAX_USER_STACK_SIZE { + return Err(ProgramError::StackOverflow); + } + if n as usize > self.stack_len() { + return Err(ProgramError::StackUnderflow); } + self.push(stack_peek(&self.generation_state, n as usize - 1)?) 
} - fn run_swap(&mut self, n: u8) -> anyhow::Result<()> { + fn run_swap(&mut self, n: u8) -> anyhow::Result<(), ProgramError> { let len = self.stack_len(); - ensure!(len > n as usize); - let to_swap = stack_peek(&self.generation_state, n as usize).unwrap(); - self.stack_segment_mut()[len - n as usize - 1] = self.stack_top(); + if n as usize >= len { + return Err(ProgramError::StackUnderflow); + } + let to_swap = stack_peek(&self.generation_state, n as usize)?; + let old_value = self.stack_segment_mut()[len - n as usize - 1]; + + self.stack_segment_mut()[len - n as usize - 1] = self.stack_top()?; + let mem_write_op = InterpreterMemOpKind::Write( + old_value, + self.context(), + Segment::Stack.unscale(), + len - n as usize - 1, + ); + self.memops.push(mem_write_op); self.generation_state.registers.stack_top = to_swap; Ok(()) } - fn run_get_context(&mut self) { - self.push(self.context.into()); + fn run_get_context(&mut self) -> anyhow::Result<(), ProgramError> { + self.push(U256::from(self.context()) << CONTEXT_SCALING_FACTOR) } - fn run_set_context(&mut self) { - let x = self.pop(); - self.context = x.as_usize(); + fn run_set_context(&mut self) -> anyhow::Result<(), ProgramError> { + let x = self.pop()?; + let new_ctx = (x >> CONTEXT_SCALING_FACTOR).as_usize(); + let sp_to_save = self.stack_len().into(); + + let old_ctx = self.context(); + + let sp_field = ContextMetadata::StackSize.unscale(); + + let old_sp_addr = MemoryAddress::new(old_ctx, Segment::ContextMetadata, sp_field); + let new_sp_addr = MemoryAddress::new(new_ctx, Segment::ContextMetadata, sp_field); + self.generation_state.memory.set(old_sp_addr, sp_to_save); + + let new_sp = self.generation_state.memory.get(new_sp_addr).as_usize(); + + if new_sp > 0 { + let new_stack_top = self.generation_state.memory.contexts[new_ctx].segments + [Segment::Stack.unscale()] + .content[new_sp - 1]; + self.generation_state.registers.stack_top = new_stack_top; + } + self.set_context(new_ctx); + 
self.generation_state.registers.stack_len = new_sp; + Ok(()) } - fn run_mload_general(&mut self) { - let context = self.pop().as_usize(); - let segment = Segment::all()[self.pop().as_usize()]; - let offset = self.pop().as_usize(); + fn run_mload_general(&mut self) -> anyhow::Result<(), ProgramError> { + let addr = self.pop()?; + let (context, segment, offset) = unpack_address!(addr); let value = self .generation_state .memory .mload_general(context, segment, offset); assert!(value.bits() <= segment.bit_range()); - self.push(value); + self.push(value) } - fn run_mload_32bytes(&mut self) { - let context = self.pop().as_usize(); - let segment = Segment::all()[self.pop().as_usize()]; - let offset = self.pop().as_usize(); - let len = self.pop().as_usize(); + fn run_mload_32bytes(&mut self) -> anyhow::Result<(), ProgramError> { + let addr = self.pop()?; + let (context, segment, offset) = unpack_address!(addr); + let len = self.pop()?.as_usize(); + if len > 32 { + return Err(ProgramError::IntegerTooLarge); + } let bytes: Vec = (0..len) .map(|i| { self.generation_state @@ -1125,125 +1295,134 @@ impl<'a> Interpreter<'a> { }) .collect(); let value = U256::from_big_endian(&bytes); - self.push(value); + self.push(value) } - fn run_mstore_general(&mut self) { - let context = self.pop().as_usize(); - let segment = Segment::all()[self.pop().as_usize()]; - let offset = self.pop().as_usize(); - let value = self.pop(); - self.generation_state + fn run_mstore_general(&mut self) -> anyhow::Result<(), ProgramError> { + let value = self.pop()?; + let addr = self.pop()?; + let (context, segment, offset) = unpack_address!(addr); + let memop = self + .generation_state .memory .mstore_general(context, segment, offset, value); + self.memops.push(memop); + Ok(()) } - fn run_mstore_32bytes(&mut self) { - let context = self.pop().as_usize(); - let segment = Segment::all()[self.pop().as_usize()]; - let offset = self.pop().as_usize(); - let value = self.pop(); - let len = self.pop().as_usize(); + 
fn run_mstore_32bytes(&mut self, n: u8) -> anyhow::Result<(), ProgramError> { + let addr = self.pop()?; + let (context, segment, offset) = unpack_address!(addr); + let value = self.pop()?; let mut bytes = vec![0; 32]; value.to_little_endian(&mut bytes); - bytes.resize(len, 0); + bytes.resize(n as usize, 0); bytes.reverse(); for (i, &byte) in bytes.iter().enumerate() { - self.generation_state - .memory - .mstore_general(context, segment, offset + i, byte.into()); + let memop = self.generation_state.memory.mstore_general( + context, + segment, + offset + i, + byte.into(), + ); + self.memops.push(memop); } - } - pub(crate) fn stack_len(&self) -> usize { - self.generation_state.registers.stack_len + self.push(addr + U256::from(n)) } - pub(crate) fn stack_top(&self) -> U256 { - self.generation_state.registers.stack_top + fn run_exit_kernel(&mut self) -> anyhow::Result<(), ProgramError> { + let kexit_info = self.pop()?; + + let kexit_info_u64 = kexit_info.0[0]; + let program_counter = kexit_info_u64 as u32 as usize; + let is_kernel_mode_val = (kexit_info_u64 >> 32) as u32; + assert!(is_kernel_mode_val == 0 || is_kernel_mode_val == 1); + let is_kernel_mode = is_kernel_mode_val != 0; + let gas_used_val = kexit_info.0[3]; + TryInto::::try_into(gas_used_val).map_err(|_| ProgramError::GasLimitError)?; + + self.generation_state.registers.program_counter = program_counter; + self.set_is_kernel(is_kernel_mode); + self.generation_state.registers.gas_used = gas_used_val; + + Ok(()) } -} -// Computes the two's complement of the given integer. -fn two_complement(x: U256) -> U256 { - let flipped_bits = x ^ MINUS_ONE; - flipped_bits.overflowing_add(U256::one()).0 -} + fn run_exception(&mut self, exc_code: u8) -> Result<(), ProgramError> { + let disallowed_len = MAX_USER_STACK_SIZE + 1; + + if self.stack_len() == disallowed_len { + // This is a stack overflow that should have been caught earlier. 
+ return Err(ProgramError::StackOverflow); + }; -fn signed_cmp(x: U256, y: U256) -> Ordering { - let x_is_zero = x.is_zero(); - let y_is_zero = y.is_zero(); + let handler_jumptable_addr = KERNEL.global_labels["exception_jumptable"]; + let handler_addr = { + let offset = handler_jumptable_addr + (exc_code as usize) * (BYTES_PER_OFFSET as usize); + assert_eq!(BYTES_PER_OFFSET, 3, "Code below assumes 3 bytes per offset"); + self.get_memory_segment(Segment::Code)[offset..offset + 3] + .iter() + .fold(U256::from(0), |acc, &elt| acc * 256 + elt) + }; + + let new_program_counter = u256_to_usize(handler_addr)?; + + let exc_info = U256::from(self.generation_state.registers.program_counter) + + (U256::from(self.generation_state.registers.gas_used) << 192); + + self.push(exc_info)?; + + // Set registers before pushing to the stack; in particular, we need to set kernel mode so we + // can't incorrectly trigger a stack overflow. However, note that we have to do it _after_ we + // make `exc_info`, which should contain the old values. 
+ self.generation_state.registers.program_counter = new_program_counter; + self.set_is_kernel(true); + self.generation_state.registers.gas_used = 0; - if x_is_zero && y_is_zero { - return Ordering::Equal; + Ok(()) } - let x_is_pos = x.eq(&(x & SIGN_MASK)); - let y_is_pos = y.eq(&(y & SIGN_MASK)); + pub(crate) const fn stack_len(&self) -> usize { + self.generation_state.registers.stack_len + } - if x_is_zero { - if y_is_pos { - return Ordering::Less; + pub(crate) fn stack_top(&self) -> anyhow::Result { + if self.stack_len() > 0 { + Ok(self.generation_state.registers.stack_top) } else { - return Ordering::Greater; + Err(ProgramError::StackUnderflow) } - }; + } - if y_is_zero { - if x_is_pos { - return Ordering::Greater; - } else { - return Ordering::Less; - } - }; + pub(crate) const fn is_kernel(&self) -> bool { + self.generation_state.registers.is_kernel + } - match (x_is_pos, y_is_pos) { - (true, true) => x.cmp(&y), - (true, false) => Ordering::Greater, - (false, true) => Ordering::Less, - (false, false) => x.cmp(&y).reverse(), + pub(crate) fn set_is_kernel(&mut self, is_kernel: bool) { + self.generation_state.registers.is_kernel = is_kernel } -} -/// -1 in two's complement representation consists in all bits set to 1. -const MINUS_ONE: U256 = U256([ - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xffffffffffffffff, -]); - -/// -2^255 in two's complement representation consists in the MSB set to 1. -const MIN_VALUE: U256 = U256([ - 0x0000000000000000, - 0x0000000000000000, - 0x0000000000000000, - 0x8000000000000000, -]); - -const SIGN_MASK: U256 = U256([ - 0xffffffffffffffff, - 0xffffffffffffffff, - 0xffffffffffffffff, - 0x7fffffffffffffff, -]); - -/// Return the (ordered) JUMPDEST offsets in the code. 
-fn find_jumpdests(code: &[u8]) -> Vec { - let mut offset = 0; - let mut res = Vec::new(); - while offset < code.len() { - let opcode = code[offset]; - match opcode { - 0x5b => res.push(offset), - x if (0x60..0x80).contains(&x) => offset += x as usize - 0x5f, // PUSH instruction, disregard data. - _ => (), + pub(crate) const fn context(&self) -> usize { + self.generation_state.registers.context + } + + pub(crate) fn set_context(&mut self, context: usize) { + if context == 0 { + assert!(self.is_kernel()); } - offset += 1; + self.generation_state.registers.context = context; + } + + /// Writes the encoding of 0 to position @ENCODED_EMPTY_NODE_POS. + pub(crate) fn initialize_rlp_segment(&mut self) { + self.generation_state.memory.set( + MemoryAddress::new(0, Segment::RlpRaw, 0xFFFFFFFF), + 128.into(), + ) } - res } fn get_mnemonic(opcode: u8) -> &'static str { @@ -1389,7 +1568,38 @@ fn get_mnemonic(opcode: u8) -> &'static str { 0xa3 => "LOG3", 0xa4 => "LOG4", 0xa5 => "PANIC", - 0xee => "MSTORE_32BYTES", + 0xc0 => "MSTORE_32BYTES_1", + 0xc1 => "MSTORE_32BYTES_2", + 0xc2 => "MSTORE_32BYTES_3", + 0xc3 => "MSTORE_32BYTES_4", + 0xc4 => "MSTORE_32BYTES_5", + 0xc5 => "MSTORE_32BYTES_6", + 0xc6 => "MSTORE_32BYTES_7", + 0xc7 => "MSTORE_32BYTES_8", + 0xc8 => "MSTORE_32BYTES_9", + 0xc9 => "MSTORE_32BYTES_10", + 0xca => "MSTORE_32BYTES_11", + 0xcb => "MSTORE_32BYTES_12", + 0xcc => "MSTORE_32BYTES_13", + 0xcd => "MSTORE_32BYTES_14", + 0xce => "MSTORE_32BYTES_15", + 0xcf => "MSTORE_32BYTES_16", + 0xd0 => "MSTORE_32BYTES_17", + 0xd1 => "MSTORE_32BYTES_18", + 0xd2 => "MSTORE_32BYTES_19", + 0xd3 => "MSTORE_32BYTES_20", + 0xd4 => "MSTORE_32BYTES_21", + 0xd5 => "MSTORE_32BYTES_22", + 0xd6 => "MSTORE_32BYTES_23", + 0xd7 => "MSTORE_32BYTES_24", + 0xd8 => "MSTORE_32BYTES_25", + 0xd9 => "MSTORE_32BYTES_26", + 0xda => "MSTORE_32BYTES_27", + 0xdb => "MSTORE_32BYTES_28", + 0xdc => "MSTORE_32BYTES_29", + 0xdd => "MSTORE_32BYTES_30", + 0xde => "MSTORE_32BYTES_31", + 0xdf => "MSTORE_32BYTES_32", 
0xf0 => "CREATE", 0xf1 => "CALL", 0xf2 => "CALLCODE", @@ -1410,11 +1620,19 @@ fn get_mnemonic(opcode: u8) -> &'static str { } } +macro_rules! unpack_address { + ($addr:ident) => {{ + let offset = $addr.low_u32() as usize; + let segment = Segment::all()[($addr >> SEGMENT_SCALING_FACTOR).low_u32() as usize]; + let context = ($addr >> CONTEXT_SCALING_FACTOR).low_u32() as usize; + (context, segment, offset) + }}; +} +pub(crate) use unpack_address; + #[cfg(test)] mod tests { - use std::collections::HashMap; - - use crate::cpu::kernel::interpreter::run; + use super::*; use crate::memory::segments::Segment; #[test] @@ -1444,20 +1662,52 @@ mod tests { // PUSH1 0x42 // PUSH1 0x27 // MSTORE8 - let code = vec![ + let code = [ 0x60, 0xff, 0x60, 0x0, 0x52, 0x60, 0, 0x51, 0x60, 0x1, 0x51, 0x60, 0x42, 0x60, 0x27, 0x53, ]; - let pis = HashMap::new(); - let run = run(&code, 0, vec![], &pis)?; - assert_eq!(run.stack(), &[0xff.into(), 0xff00.into()]); + let mut interpreter = Interpreter::new_with_kernel(0, vec![]); + + interpreter.set_code(1, code.to_vec()); + + interpreter.generation_state.memory.contexts[1].segments + [Segment::ContextMetadata.unscale()] + .set(ContextMetadata::GasLimit.unscale(), 100_000.into()); + // Set context and kernel mode. + interpreter.set_context(1); + interpreter.set_is_kernel(false); + // Set memory necessary to sys_stop. + interpreter.generation_state.memory.set( + MemoryAddress::new( + 1, + Segment::ContextMetadata, + ContextMetadata::ParentProgramCounter.unscale(), + ), + 0xdeadbeefu32.into(), + ); + interpreter.generation_state.memory.set( + MemoryAddress::new( + 1, + Segment::ContextMetadata, + ContextMetadata::ParentContext.unscale(), + ), + U256::one() << CONTEXT_SCALING_FACTOR, + ); + + interpreter.run()?; + + // sys_stop returns `success` and `cum_gas_used`, that we need to pop. 
+ interpreter.pop().expect("Stack should not be empty"); + interpreter.pop().expect("Stack should not be empty"); + + assert_eq!(interpreter.stack(), &[0xff.into(), 0xff00.into()]); assert_eq!( - run.generation_state.memory.contexts[0].segments[Segment::MainMemory as usize] + interpreter.generation_state.memory.contexts[1].segments[Segment::MainMemory.unscale()] .get(0x27), 0x42.into() ); assert_eq!( - run.generation_state.memory.contexts[0].segments[Segment::MainMemory as usize] + interpreter.generation_state.memory.contexts[1].segments[Segment::MainMemory.unscale()] .get(0x1f), 0xff.into() ); diff --git a/evm/src/cpu/kernel/opcodes.rs b/evm/src/cpu/kernel/opcodes.rs index 7f11765932..503f4182e2 100644 --- a/evm/src/cpu/kernel/opcodes.rs +++ b/evm/src/cpu/kernel/opcodes.rs @@ -115,7 +115,38 @@ pub fn get_opcode(mnemonic: &str) -> u8 { "LOG3" => 0xa3, "LOG4" => 0xa4, "PANIC" => 0xa5, - "MSTORE_32BYTES" => 0xee, + "MSTORE_32BYTES_1" => 0xc0, + "MSTORE_32BYTES_2" => 0xc1, + "MSTORE_32BYTES_3" => 0xc2, + "MSTORE_32BYTES_4" => 0xc3, + "MSTORE_32BYTES_5" => 0xc4, + "MSTORE_32BYTES_6" => 0xc5, + "MSTORE_32BYTES_7" => 0xc6, + "MSTORE_32BYTES_8" => 0xc7, + "MSTORE_32BYTES_9" => 0xc8, + "MSTORE_32BYTES_10" => 0xc9, + "MSTORE_32BYTES_11" => 0xca, + "MSTORE_32BYTES_12" => 0xcb, + "MSTORE_32BYTES_13" => 0xcc, + "MSTORE_32BYTES_14" => 0xcd, + "MSTORE_32BYTES_15" => 0xce, + "MSTORE_32BYTES_16" => 0xcf, + "MSTORE_32BYTES_17" => 0xd0, + "MSTORE_32BYTES_18" => 0xd1, + "MSTORE_32BYTES_19" => 0xd2, + "MSTORE_32BYTES_20" => 0xd3, + "MSTORE_32BYTES_21" => 0xd4, + "MSTORE_32BYTES_22" => 0xd5, + "MSTORE_32BYTES_23" => 0xd6, + "MSTORE_32BYTES_24" => 0xd7, + "MSTORE_32BYTES_25" => 0xd8, + "MSTORE_32BYTES_26" => 0xd9, + "MSTORE_32BYTES_27" => 0xda, + "MSTORE_32BYTES_28" => 0xdb, + "MSTORE_32BYTES_29" => 0xdc, + "MSTORE_32BYTES_30" => 0xdd, + "MSTORE_32BYTES_31" => 0xde, + "MSTORE_32BYTES_32" => 0xdf, "CREATE" => 0xf0, "CALL" => 0xf1, "CALLCODE" => 0xf2, diff --git 
a/evm/src/cpu/kernel/parser.rs b/evm/src/cpu/kernel/parser.rs index 49181b716f..7864acfe0e 100644 --- a/evm/src/cpu/kernel/parser.rs +++ b/evm/src/cpu/kernel/parser.rs @@ -4,13 +4,13 @@ use ethereum_types::U256; use pest::iterators::Pair; use pest::Parser; -use super::ast::StackPlaceholder; +use super::ast::{BytesTarget, StackPlaceholder}; use crate::cpu::kernel::ast::{File, Item, PushTarget, StackReplacement}; /// Parses EVM assembly code. #[derive(pest_derive::Parser)] #[grammar = "cpu/kernel/evm_asm.pest"] -pub struct AsmParser; +struct AsmParser; pub(crate) fn parse(s: &str) -> File { let file = AsmParser::parse(Rule::file, s) @@ -38,7 +38,7 @@ fn parse_item(item: Pair) -> Item { Rule::macro_label_decl => { Item::MacroLabelDeclaration(item.into_inner().next().unwrap().as_str().into()) } - Rule::bytes_item => Item::Bytes(item.into_inner().map(parse_literal_u8).collect()), + Rule::bytes_item => Item::Bytes(item.into_inner().map(parse_bytes_target).collect()), Rule::jumptable_item => { Item::Jumptable(item.into_inner().map(|i| i.as_str().into()).collect()) } @@ -167,6 +167,16 @@ fn parse_push_target(target: Pair) -> PushTarget { } } +fn parse_bytes_target(target: Pair) -> BytesTarget { + assert_eq!(target.as_rule(), Rule::bytes_target); + let inner = target.into_inner().next().unwrap(); + match inner.as_rule() { + Rule::literal => BytesTarget::Literal(parse_literal_u8(inner)), + Rule::constant => BytesTarget::Constant(inner.into_inner().next().unwrap().as_str().into()), + _ => panic!("Unexpected {:?}", inner.as_rule()), + } +} + fn parse_literal_u8(literal: Pair) -> u8 { let literal = literal.into_inner().next().unwrap(); match literal.as_rule() { diff --git a/evm/src/cpu/kernel/stack/permutations.rs b/evm/src/cpu/kernel/stack/permutations.rs index d64755ede6..71304edd0c 100644 --- a/evm/src/cpu/kernel/stack/permutations.rs +++ b/evm/src/cpu/kernel/stack/permutations.rs @@ -19,8 +19,8 @@ //! //! 
We typically represent a `(0 i)` transposition as a single scalar `i`. +use core::hash::Hash; use std::collections::{HashMap, HashSet}; -use std::hash::Hash; use crate::cpu::kernel::stack::stack_manipulation::{StackItem, StackOp}; @@ -58,8 +58,8 @@ fn apply_perm(permutation: Vec>, mut lst: Vec(lst_a: &[T], lst_b: &[T]) -> Vec> { - // We should check to ensure that A and B are indeed rearrangments of each other. +pub(crate) fn find_permutation(lst_a: &[T], lst_b: &[T]) -> Vec> { + // We should check to ensure that A and B are indeed rearrangements of each other. assert!(is_permutation(lst_a, lst_b)); let n = lst_a.len(); @@ -210,7 +210,7 @@ fn transpositions_to_stack_ops(trans: Vec) -> Vec { trans.into_iter().map(|i| StackOp::Swap(i as u8)).collect() } -pub fn is_permutation(a: &[T], b: &[T]) -> bool { +pub(crate) fn is_permutation(a: &[T], b: &[T]) -> bool { make_multiset(a) == make_multiset(b) } diff --git a/evm/src/cpu/kernel/stack/stack_manipulation.rs b/evm/src/cpu/kernel/stack/stack_manipulation.rs index 47b02cf692..a7b376c5ea 100644 --- a/evm/src/cpu/kernel/stack/stack_manipulation.rs +++ b/evm/src/cpu/kernel/stack/stack_manipulation.rs @@ -1,7 +1,7 @@ -use std::cmp::Ordering; +use core::cmp::Ordering; +use core::hash::Hash; use std::collections::hash_map::Entry::{Occupied, Vacant}; use std::collections::{BinaryHeap, HashMap}; -use std::hash::Hash; use itertools::Itertools; @@ -286,15 +286,15 @@ impl StackOp { panic!("Target should have been expanded already: {target:?}") } }; - // This is just a rough estimate; we can update it after implementing PUSH. - (bytes, bytes) + // A PUSH takes one cycle, and 1 memory read per byte. + (1, bytes + 1) } - // A POP takes one cycle, and doesn't involve memory, it just decrements a pointer. - Pop => (1, 0), + // A POP takes one cycle, and most of the time a read to update the top of the stack. + Pop => (1, 1), // A DUP takes one cycle, and a read and a write. 
StackOp::Dup(_) => (1, 2), - // A SWAP takes one cycle with four memory ops, to read both values then write to them. - StackOp::Swap(_) => (1, 4), + // A SWAP takes one cycle with three memory ops, to read both values then write to them. + StackOp::Swap(_) => (1, 3), }; let cpu_cost = cpu_rows * NUM_CPU_COLUMNS as u32; diff --git a/evm/src/cpu/kernel/tests/account_code.rs b/evm/src/cpu/kernel/tests/account_code.rs index f4c18fe603..5e2dddca9e 100644 --- a/evm/src/cpu/kernel/tests/account_code.rs +++ b/evm/src/cpu/kernel/tests/account_code.rs @@ -1,19 +1,57 @@ use std::collections::HashMap; -use anyhow::{anyhow, Result}; +use anyhow::Result; +use eth_trie_utils::nibbles::Nibbles; use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; use ethereum_types::{Address, BigEndianHash, H256, U256}; +use hex_literal::hex; use keccak_hash::keccak; use rand::{thread_rng, Rng}; use crate::cpu::kernel::aggregator::KERNEL; +use crate::cpu::kernel::constants::context_metadata::ContextMetadata::{self, GasLimit}; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cpu::kernel::interpreter::Interpreter; use crate::cpu::kernel::tests::mpt::nibbles_64; -use crate::generation::mpt::{all_mpt_prover_inputs_reversed, AccountRlp}; +use crate::generation::mpt::{load_all_mpts, AccountRlp}; +use crate::generation::TrieInputs; use crate::memory::segments::Segment; +use crate::witness::memory::MemoryAddress; +use crate::witness::operation::CONTEXT_SCALING_FACTOR; use crate::Node; +pub(crate) fn initialize_mpts(interpreter: &mut Interpreter, trie_inputs: &TrieInputs) { + // Load all MPTs. 
+ let (trie_root_ptrs, trie_data) = + load_all_mpts(trie_inputs).expect("Invalid MPT data for preinitialization"); + + let state_addr = + MemoryAddress::new_bundle((GlobalMetadata::StateTrieRoot as usize).into()).unwrap(); + let txn_addr = + MemoryAddress::new_bundle((GlobalMetadata::TransactionTrieRoot as usize).into()).unwrap(); + let receipts_addr = + MemoryAddress::new_bundle((GlobalMetadata::ReceiptTrieRoot as usize).into()).unwrap(); + let len_addr = + MemoryAddress::new_bundle((GlobalMetadata::TrieDataSize as usize).into()).unwrap(); + + let to_set = [ + (state_addr, trie_root_ptrs.state_root_ptr.into()), + (txn_addr, trie_root_ptrs.txn_root_ptr.into()), + (receipts_addr, trie_root_ptrs.receipt_root_ptr.into()), + (len_addr, trie_data.len().into()), + ]; + + interpreter.set_memory_multi_addresses(&to_set); + + for (i, data) in trie_data.iter().enumerate() { + let trie_addr = MemoryAddress::new(0, Segment::TrieData, i); + interpreter + .generation_state + .memory + .set(trie_addr, data.into()); + } +} + // Test account with a given code hash. 
fn test_account(code: &[u8]) -> AccountRlp { AccountRlp { @@ -37,20 +75,12 @@ fn prepare_interpreter( address: Address, account: &AccountRlp, ) -> Result<()> { - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; let mpt_insert_state_trie = KERNEL.global_labels["mpt_insert_state_trie"]; let mpt_hash_state_trie = KERNEL.global_labels["mpt_hash_state_trie"]; let mut state_trie: HashedPartialTrie = Default::default(); let trie_inputs = Default::default(); - interpreter.generation_state.registers.program_counter = load_all_mpts; - interpreter.push(0xDEADBEEFu32.into()); - - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; - assert_eq!(interpreter.stack(), vec![]); + initialize_mpts(interpreter, &trie_inputs); let k = nibbles_64(U256::from_big_endian( keccak(address.to_fixed_bytes()).as_bytes(), @@ -73,9 +103,15 @@ fn prepare_interpreter( trie_data.push(account.code_hash.into_uint()); let trie_data_len = trie_data.len().into(); interpreter.set_global_metadata_field(GlobalMetadata::TrieDataSize, trie_data_len); - interpreter.push(0xDEADBEEFu32.into()); - interpreter.push(value_ptr.into()); // value_ptr - interpreter.push(k.try_into_u256().unwrap()); // key + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(value_ptr.into()) + .expect("The stack should not overflow"); // value_ptr + interpreter + .push(k.try_into_u256().unwrap()) + .expect("The stack should not overflow"); // key interpreter.run()?; assert_eq!( @@ -87,16 +123,21 @@ fn prepare_interpreter( // Now, execute mpt_hash_state_trie. interpreter.generation_state.registers.program_counter = mpt_hash_state_trie; - interpreter.push(0xDEADBEEFu32.into()); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(1.into()) // Initial length of the trie data segment, unused. 
+ .expect("The stack should not overflow"); interpreter.run()?; assert_eq!( interpreter.stack().len(), - 1, - "Expected 1 item on stack after hashing, found {:?}", + 2, + "Expected 2 items on stack after hashing, found {:?}", interpreter.stack() ); - let hash = H256::from_uint(&interpreter.stack()[0]); + let hash = H256::from_uint(&interpreter.stack()[1]); state_trie.insert(k, rlp::encode(account).to_vec()); let expected_state_trie_hash = state_trie.hash(); @@ -119,10 +160,15 @@ fn test_extcodesize() -> Result<()> { // Test `extcodesize` interpreter.generation_state.registers.program_counter = extcodesize; - interpreter.pop(); + interpreter.pop().expect("The stack should not be empty"); + interpreter.pop().expect("The stack should not be empty"); assert!(interpreter.stack().is_empty()); - interpreter.push(0xDEADBEEFu32.into()); - interpreter.push(U256::from_big_endian(address.as_bytes())); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(U256::from_big_endian(address.as_bytes())) + .expect("The stack should not overflow"); interpreter.generation_state.inputs.contract_code = HashMap::from([(keccak(&code), code.clone())]); interpreter.run()?; @@ -142,17 +188,22 @@ fn test_extcodecopy() -> Result<()> { // Prepare the interpreter by inserting the account in the state trie. prepare_interpreter(&mut interpreter, address, &account)?; - let extcodecopy = KERNEL.global_labels["extcodecopy"]; + let context = interpreter.context(); + interpreter.generation_state.memory.contexts[context].segments + [Segment::ContextMetadata.unscale()] + .set(GasLimit.unscale(), U256::from(1000000000000u64)); + + let extcodecopy = KERNEL.global_labels["sys_extcodecopy"]; // Put random data in main memory and the `KernelAccountCode` segment for realism. 
let mut rng = thread_rng(); for i in 0..2000 { - interpreter.generation_state.memory.contexts[interpreter.context].segments - [Segment::MainMemory as usize] - .set(i, U256::from(rng.gen::())); - interpreter.generation_state.memory.contexts[interpreter.context].segments - [Segment::KernelAccountCode as usize] - .set(i, U256::from(rng.gen::())); + interpreter.generation_state.memory.contexts[context].segments + [Segment::MainMemory.unscale()] + .set(i, U256::from(rng.gen::())); + interpreter.generation_state.memory.contexts[context].segments + [Segment::KernelAccountCode.unscale()] + .set(i, U256::from(rng.gen::())); } // Random inputs @@ -162,13 +213,24 @@ fn test_extcodecopy() -> Result<()> { // Test `extcodecopy` interpreter.generation_state.registers.program_counter = extcodecopy; - interpreter.pop(); + interpreter.pop().expect("The stack should not be empty"); + interpreter.pop().expect("The stack should not be empty"); assert!(interpreter.stack().is_empty()); - interpreter.push(0xDEADBEEFu32.into()); - interpreter.push(size.into()); - interpreter.push(offset.into()); - interpreter.push(dest_offset.into()); - interpreter.push(U256::from_big_endian(address.as_bytes())); + interpreter + .push(size.into()) + .expect("The stack should not overflow"); + interpreter + .push(offset.into()) + .expect("The stack should not overflow"); + interpreter + .push(dest_offset.into()) + .expect("The stack should not overflow"); + interpreter + .push(U256::from_big_endian(address.as_bytes())) + .expect("The stack should not overflow"); + interpreter + .push((0xDEADBEEFu64 + (1 << 32)).into()) + .expect("The stack should not overflow"); // kexit_info interpreter.generation_state.inputs.contract_code = HashMap::from([(keccak(&code), code.clone())]); interpreter.run()?; @@ -176,9 +238,9 @@ fn test_extcodecopy() -> Result<()> { assert!(interpreter.stack().is_empty()); // Check that the code was correctly copied to memory. 
for i in 0..size { - let memory = interpreter.generation_state.memory.contexts[interpreter.context].segments - [Segment::MainMemory as usize] - .get(dest_offset + i); + let memory = interpreter.generation_state.memory.contexts[context].segments + [Segment::MainMemory.unscale()] + .get(dest_offset + i); assert_eq!( memory, code.get(offset + i).copied().unwrap_or_default().into() @@ -187,3 +249,221 @@ fn test_extcodecopy() -> Result<()> { Ok(()) } + +/// Prepare the interpreter for storage tests by inserting all necessary accounts +/// in the state trie, adding the code we want to context 1 and switching the context. +fn prepare_interpreter_all_accounts( + interpreter: &mut Interpreter, + trie_inputs: TrieInputs, + addr: [u8; 20], + code: &[u8], +) -> Result<()> { + // Load all MPTs. + initialize_mpts(interpreter, &trie_inputs); + assert_eq!(interpreter.stack(), vec![]); + + // Switch context and initialize memory with the data we need for the tests. + interpreter.generation_state.registers.program_counter = 0; + interpreter.set_code(1, code.to_vec()); + interpreter.set_context_metadata_field( + 1, + ContextMetadata::Address, + U256::from_big_endian(&addr), + ); + interpreter.set_context_metadata_field(1, ContextMetadata::GasLimit, 100_000.into()); + interpreter.set_context(1); + interpreter.set_is_kernel(false); + interpreter.set_context_metadata_field( + 1, + ContextMetadata::ParentProgramCounter, + 0xdeadbeefu32.into(), + ); + interpreter.set_context_metadata_field( + 1, + ContextMetadata::ParentContext, + U256::one() << CONTEXT_SCALING_FACTOR, // ctx = 1 + ); + + Ok(()) +} + +/// Tests an SSTORE within a code similar to the contract code in add11_yml. +#[test] +fn sstore() -> Result<()> { + // We take the same `to` account as in add11_yml. 
+ let addr = hex!("095e7baea6a6c7c4c2dfeb977efac326af552d87"); + + let addr_hashed = keccak(addr); + + let addr_nibbles = Nibbles::from_bytes_be(addr_hashed.as_bytes()).unwrap(); + + let code = [0x60, 0x01, 0x60, 0x01, 0x01, 0x60, 0x00, 0x55, 0x00]; + let code_hash = keccak(code); + + let account_before = AccountRlp { + balance: 0x0de0b6b3a7640000u64.into(), + code_hash, + ..AccountRlp::default() + }; + + let mut state_trie_before = HashedPartialTrie::from(Node::Empty); + + state_trie_before.insert(addr_nibbles, rlp::encode(&account_before).to_vec()); + + let trie_inputs = TrieInputs { + state_trie: state_trie_before.clone(), + transactions_trie: Node::Empty.into(), + receipts_trie: Node::Empty.into(), + storage_tries: vec![(addr_hashed, Node::Empty.into())], + }; + + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + + // Prepare the interpreter by inserting the account in the state trie. + prepare_interpreter_all_accounts(&mut interpreter, trie_inputs, addr, &code)?; + + interpreter.run()?; + + // The first two elements in the stack are `success` and `leftover_gas`, + // returned by the `sys_stop` opcode. + interpreter.pop().expect("Stack should not be empty"); + interpreter.pop().expect("Stack should not be empty"); + + // The code should have added an element to the storage of `to_account`. We run + // `mpt_hash_state_trie` to check that. + let account_after = AccountRlp { + balance: 0x0de0b6b3a7640000u64.into(), + code_hash, + storage_root: HashedPartialTrie::from(Node::Leaf { + nibbles: Nibbles::from_h256_be(keccak([0u8; 32])), + value: vec![2], + }) + .hash(), + ..AccountRlp::default() + }; + // Now, execute mpt_hash_state_trie. 
+ let mpt_hash_state_trie = KERNEL.global_labels["mpt_hash_state_trie"]; + interpreter.generation_state.registers.program_counter = mpt_hash_state_trie; + interpreter.set_is_kernel(true); + interpreter.set_context(0); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(1.into()) // Initial length of the trie data segment, unused. + .expect("The stack should not overflow"); + interpreter.run()?; + + assert_eq!( + interpreter.stack().len(), + 2, + "Expected 2 items on stack after hashing, found {:?}", + interpreter.stack() + ); + + let hash = H256::from_uint(&interpreter.stack()[1]); + + let mut expected_state_trie_after = HashedPartialTrie::from(Node::Empty); + expected_state_trie_after.insert(addr_nibbles, rlp::encode(&account_after).to_vec()); + + let expected_state_trie_hash = expected_state_trie_after.hash(); + assert_eq!(hash, expected_state_trie_hash); + Ok(()) +} + +/// Tests an SLOAD within a code similar to the contract code in add11_yml. +#[test] +fn sload() -> Result<()> { + // We take the same `to` account as in add11_yml. + let addr = hex!("095e7baea6a6c7c4c2dfeb977efac326af552d87"); + + let addr_hashed = keccak(addr); + + let addr_nibbles = Nibbles::from_bytes_be(addr_hashed.as_bytes()).unwrap(); + + // This code is similar to the one in add11_yml's contract, but we pop the added value + // and carry out an SLOAD instead of an SSTORE. We also add a PUSH at the end. 
+ let code = [ + 0x60, 0x01, 0x60, 0x01, 0x01, 0x50, 0x60, 0x00, 0x54, 0x60, 0x03, 0x00, + ]; + let code_hash = keccak(code); + + let account_before = AccountRlp { + balance: 0x0de0b6b3a7640000u64.into(), + code_hash, + ..AccountRlp::default() + }; + + let mut state_trie_before = HashedPartialTrie::from(Node::Empty); + + state_trie_before.insert(addr_nibbles, rlp::encode(&account_before).to_vec()); + + let trie_inputs = TrieInputs { + state_trie: state_trie_before.clone(), + transactions_trie: Node::Empty.into(), + receipts_trie: Node::Empty.into(), + storage_tries: vec![(addr_hashed, Node::Empty.into())], + }; + + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + + // Prepare the interpreter by inserting the account in the state trie. + prepare_interpreter_all_accounts(&mut interpreter, trie_inputs, addr, &code)?; + interpreter.run()?; + + // The first two elements in the stack are `success` and `leftover_gas`, + // returned by the `sys_stop` opcode. + interpreter + .pop() + .expect("The stack length should not be empty."); + interpreter + .pop() + .expect("The stack length should not be empty."); + + // The SLOAD in the provided code should return 0, since + // the storage trie is empty. The last step in the code + // pushes the value 3. + assert_eq!(interpreter.stack(), vec![0x0.into(), 0x3.into()]); + interpreter + .pop() + .expect("The stack length should not be empty."); + interpreter + .pop() + .expect("The stack length should not be empty."); + // Now, execute mpt_hash_state_trie. We check that the state trie has not changed. 
+ let mpt_hash_state_trie = KERNEL.global_labels["mpt_hash_state_trie"]; + interpreter.generation_state.registers.program_counter = mpt_hash_state_trie; + interpreter.set_is_kernel(true); + interpreter.set_context(0); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow."); + interpreter + .push(1.into()) // Initial length of the trie data segment, unused. + .expect("The stack should not overflow."); + interpreter.run()?; + + assert_eq!( + interpreter.stack().len(), + 2, + "Expected 2 items on stack after hashing, found {:?}", + interpreter.stack() + ); + + let trie_data_segment_len = interpreter.stack()[0]; + assert_eq!( + trie_data_segment_len, + interpreter + .get_memory_segment(Segment::TrieData) + .len() + .into() + ); + + let hash = H256::from_uint(&interpreter.stack()[1]); + + let expected_state_trie_hash = state_trie_before.hash(); + assert_eq!(hash, expected_state_trie_hash); + Ok(()) +} diff --git a/evm/src/cpu/kernel/tests/add11.rs b/evm/src/cpu/kernel/tests/add11.rs new file mode 100644 index 0000000000..1b7c5b1636 --- /dev/null +++ b/evm/src/cpu/kernel/tests/add11.rs @@ -0,0 +1,313 @@ +use std::collections::HashMap; +use std::str::FromStr; + +use eth_trie_utils::nibbles::Nibbles; +use eth_trie_utils::partial_trie::{HashedPartialTrie, Node, PartialTrie}; +use ethereum_types::{Address, BigEndianHash, H256}; +use hex_literal::hex; +use keccak_hash::keccak; + +use crate::cpu::kernel::aggregator::KERNEL; +use crate::cpu::kernel::constants::context_metadata::ContextMetadata; +use crate::cpu::kernel::interpreter::Interpreter; +use crate::generation::mpt::{AccountRlp, LegacyReceiptRlp}; +use crate::generation::TrieInputs; +use crate::proof::{BlockHashes, BlockMetadata, TrieRoots}; +use crate::GenerationInputs; + +#[test] +fn test_add11_yml() { + let beneficiary = hex!("2adc25665018aa1fe0e6bc666dac8fc2697ff9ba"); + let sender = hex!("a94f5374fce5edbc8e2a8697c15331677e6ebf0b"); + let to = 
hex!("095e7baea6a6c7c4c2dfeb977efac326af552d87"); + + let beneficiary_state_key = keccak(beneficiary); + let sender_state_key = keccak(sender); + let to_hashed = keccak(to); + + let beneficiary_nibbles = Nibbles::from_bytes_be(beneficiary_state_key.as_bytes()).unwrap(); + let sender_nibbles = Nibbles::from_bytes_be(sender_state_key.as_bytes()).unwrap(); + let to_nibbles = Nibbles::from_bytes_be(to_hashed.as_bytes()).unwrap(); + + let code = [0x60, 0x01, 0x60, 0x01, 0x01, 0x60, 0x00, 0x55, 0x00]; + let code_hash = keccak(code); + + let mut contract_code = HashMap::new(); + contract_code.insert(keccak(vec![]), vec![]); + contract_code.insert(code_hash, code.to_vec()); + + let beneficiary_account_before = AccountRlp { + nonce: 1.into(), + ..AccountRlp::default() + }; + let sender_account_before = AccountRlp { + balance: 0x0de0b6b3a7640000u64.into(), + ..AccountRlp::default() + }; + let to_account_before = AccountRlp { + balance: 0x0de0b6b3a7640000u64.into(), + code_hash, + ..AccountRlp::default() + }; + + let mut state_trie_before = HashedPartialTrie::from(Node::Empty); + state_trie_before.insert( + beneficiary_nibbles, + rlp::encode(&beneficiary_account_before).to_vec(), + ); + state_trie_before.insert(sender_nibbles, rlp::encode(&sender_account_before).to_vec()); + state_trie_before.insert(to_nibbles, rlp::encode(&to_account_before).to_vec()); + + let tries_before = TrieInputs { + state_trie: state_trie_before, + transactions_trie: Node::Empty.into(), + receipts_trie: Node::Empty.into(), + storage_tries: vec![(to_hashed, Node::Empty.into())], + }; + + let txn = hex!("f863800a83061a8094095e7baea6a6c7c4c2dfeb977efac326af552d87830186a0801ba0ffb600e63115a7362e7811894a91d8ba4330e526f22121c994c4692035dfdfd5a06198379fcac8de3dbfac48b165df4bf88e2088f294b61efb9a65fe2281c76e16"); + + let gas_used = 0xa868u64.into(); + + let expected_state_trie_after = { + let beneficiary_account_after = AccountRlp { + nonce: 1.into(), + ..AccountRlp::default() + }; + let sender_account_after = 
AccountRlp { + balance: 0xde0b6b3a75be550u64.into(), + nonce: 1.into(), + ..AccountRlp::default() + }; + let to_account_after = AccountRlp { + balance: 0xde0b6b3a76586a0u64.into(), + code_hash, + // Storage map: { 0 => 2 } + storage_root: HashedPartialTrie::from(Node::Leaf { + nibbles: Nibbles::from_h256_be(keccak([0u8; 32])), + value: vec![2], + }) + .hash(), + ..AccountRlp::default() + }; + + let mut expected_state_trie_after = HashedPartialTrie::from(Node::Empty); + expected_state_trie_after.insert( + beneficiary_nibbles, + rlp::encode(&beneficiary_account_after).to_vec(), + ); + expected_state_trie_after + .insert(sender_nibbles, rlp::encode(&sender_account_after).to_vec()); + expected_state_trie_after.insert(to_nibbles, rlp::encode(&to_account_after).to_vec()); + expected_state_trie_after + }; + let receipt_0 = LegacyReceiptRlp { + status: true, + cum_gas_used: gas_used, + bloom: vec![0; 256].into(), + logs: vec![], + }; + let mut receipts_trie = HashedPartialTrie::from(Node::Empty); + receipts_trie.insert( + Nibbles::from_str("0x80").unwrap(), + rlp::encode(&receipt_0).to_vec(), + ); + let transactions_trie: HashedPartialTrie = Node::Leaf { + nibbles: Nibbles::from_str("0x80").unwrap(), + value: txn.to_vec(), + } + .into(); + + let trie_roots_after = TrieRoots { + state_root: expected_state_trie_after.hash(), + transactions_root: transactions_trie.hash(), + receipts_root: receipts_trie.hash(), + }; + + let block_metadata = BlockMetadata { + block_beneficiary: Address::from(beneficiary), + block_timestamp: 0x03e8.into(), + block_number: 1.into(), + block_difficulty: 0x020000.into(), + block_random: H256::from_uint(&0x020000.into()), + block_gaslimit: 0xff112233u32.into(), + block_chain_id: 1.into(), + block_base_fee: 0xa.into(), + block_gas_used: gas_used, + block_blob_base_fee: 0x2.into(), + block_bloom: [0.into(); 8], + }; + + let tries_inputs = GenerationInputs { + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], + tries: tries_before, + 
trie_roots_after, + contract_code: contract_code.clone(), + block_metadata, + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + txn_number_before: 0.into(), + gas_used_before: 0.into(), + gas_used_after: gas_used, + block_hashes: BlockHashes { + prev_hashes: vec![H256::default(); 256], + cur_hash: H256::default(), + }, + }; + + let initial_stack = vec![]; + let mut interpreter = + Interpreter::new_with_generation_inputs_and_kernel(0, initial_stack, tries_inputs); + + let route_txn_label = KERNEL.global_labels["main"]; + // Switch context and initialize memory with the data we need for the tests. + interpreter.generation_state.registers.program_counter = route_txn_label; + interpreter.set_context_metadata_field(0, ContextMetadata::GasLimit, 1_000_000.into()); + interpreter.set_is_kernel(true); + interpreter.run().expect("Proving add11 failed."); +} + +#[test] +fn test_add11_yml_with_exception() { + // In this test, we make sure that the user code throws a stack underflow exception. 
+ let beneficiary = hex!("2adc25665018aa1fe0e6bc666dac8fc2697ff9ba"); + let sender = hex!("a94f5374fce5edbc8e2a8697c15331677e6ebf0b"); + let to = hex!("095e7baea6a6c7c4c2dfeb977efac326af552d87"); + + let beneficiary_state_key = keccak(beneficiary); + let sender_state_key = keccak(sender); + let to_hashed = keccak(to); + + let beneficiary_nibbles = Nibbles::from_bytes_be(beneficiary_state_key.as_bytes()).unwrap(); + let sender_nibbles = Nibbles::from_bytes_be(sender_state_key.as_bytes()).unwrap(); + let to_nibbles = Nibbles::from_bytes_be(to_hashed.as_bytes()).unwrap(); + + let code = [0x60, 0x01, 0x60, 0x01, 0x01, 0x8e, 0x00]; + let code_hash = keccak(code); + + let mut contract_code = HashMap::new(); + contract_code.insert(keccak(vec![]), vec![]); + contract_code.insert(code_hash, code.to_vec()); + + let beneficiary_account_before = AccountRlp { + nonce: 1.into(), + ..AccountRlp::default() + }; + let sender_account_before = AccountRlp { + balance: 0x0de0b6b3a7640000u64.into(), + ..AccountRlp::default() + }; + let to_account_before = AccountRlp { + balance: 0x0de0b6b3a7640000u64.into(), + code_hash, + ..AccountRlp::default() + }; + + let mut state_trie_before = HashedPartialTrie::from(Node::Empty); + state_trie_before.insert( + beneficiary_nibbles, + rlp::encode(&beneficiary_account_before).to_vec(), + ); + state_trie_before.insert(sender_nibbles, rlp::encode(&sender_account_before).to_vec()); + state_trie_before.insert(to_nibbles, rlp::encode(&to_account_before).to_vec()); + + let tries_before = TrieInputs { + state_trie: state_trie_before, + transactions_trie: Node::Empty.into(), + receipts_trie: Node::Empty.into(), + storage_tries: vec![(to_hashed, Node::Empty.into())], + }; + + let txn = hex!("f863800a83061a8094095e7baea6a6c7c4c2dfeb977efac326af552d87830186a0801ba0ffb600e63115a7362e7811894a91d8ba4330e526f22121c994c4692035dfdfd5a06198379fcac8de3dbfac48b165df4bf88e2088f294b61efb9a65fe2281c76e16"); + let txn_gas_limit = 400_000; + let gas_price = 10; + + // Here, 
since the transaction fails, it consumes its gas limit, and does nothing else. + let expected_state_trie_after = { + let beneficiary_account_after = beneficiary_account_before; + // This is the only account that changes: the nonce and the balance are updated. + let sender_account_after = AccountRlp { + balance: sender_account_before.balance - txn_gas_limit * gas_price, + nonce: 1.into(), + ..AccountRlp::default() + }; + let to_account_after = to_account_before; + + let mut expected_state_trie_after = HashedPartialTrie::from(Node::Empty); + expected_state_trie_after.insert( + beneficiary_nibbles, + rlp::encode(&beneficiary_account_after).to_vec(), + ); + expected_state_trie_after + .insert(sender_nibbles, rlp::encode(&sender_account_after).to_vec()); + expected_state_trie_after.insert(to_nibbles, rlp::encode(&to_account_after).to_vec()); + expected_state_trie_after + }; + + let receipt_0 = LegacyReceiptRlp { + status: false, + cum_gas_used: txn_gas_limit.into(), + bloom: vec![0; 256].into(), + logs: vec![], + }; + let mut receipts_trie = HashedPartialTrie::from(Node::Empty); + receipts_trie.insert( + Nibbles::from_str("0x80").unwrap(), + rlp::encode(&receipt_0).to_vec(), + ); + let transactions_trie: HashedPartialTrie = Node::Leaf { + nibbles: Nibbles::from_str("0x80").unwrap(), + value: txn.to_vec(), + } + .into(); + + let trie_roots_after = TrieRoots { + state_root: expected_state_trie_after.hash(), + transactions_root: transactions_trie.hash(), + receipts_root: receipts_trie.hash(), + }; + + let block_metadata = BlockMetadata { + block_beneficiary: Address::from(beneficiary), + block_timestamp: 0x03e8.into(), + block_number: 1.into(), + block_difficulty: 0x020000.into(), + block_random: H256::from_uint(&0x020000.into()), + block_gaslimit: 0xff112233u32.into(), + block_chain_id: 1.into(), + block_base_fee: 0xa.into(), + block_gas_used: txn_gas_limit.into(), + block_blob_base_fee: 0x2.into(), + block_bloom: [0.into(); 8], + }; + + let tries_inputs = 
GenerationInputs { + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], + tries: tries_before, + trie_roots_after, + contract_code: contract_code.clone(), + block_metadata, + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + txn_number_before: 0.into(), + gas_used_before: 0.into(), + gas_used_after: txn_gas_limit.into(), + block_hashes: BlockHashes { + prev_hashes: vec![H256::default(); 256], + cur_hash: H256::default(), + }, + }; + + let initial_stack = vec![]; + let mut interpreter = + Interpreter::new_with_generation_inputs_and_kernel(0, initial_stack, tries_inputs); + + let route_txn_label = KERNEL.global_labels["main"]; + // Switch context and initialize memory with the data we need for the tests. + interpreter.generation_state.registers.program_counter = route_txn_label; + interpreter.set_context_metadata_field(0, ContextMetadata::GasLimit, 1_000_000.into()); + interpreter.set_is_kernel(true); + interpreter + .run() + .expect("Proving add11 with exception failed."); +} diff --git a/evm/src/cpu/kernel/tests/balance.rs b/evm/src/cpu/kernel/tests/balance.rs index 40214405c7..b393c05cf5 100644 --- a/evm/src/cpu/kernel/tests/balance.rs +++ b/evm/src/cpu/kernel/tests/balance.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Result}; +use anyhow::Result; use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; use ethereum_types::{Address, BigEndianHash, H256, U256}; use keccak_hash::keccak; @@ -7,8 +7,9 @@ use rand::{thread_rng, Rng}; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cpu::kernel::interpreter::Interpreter; +use crate::cpu::kernel::tests::account_code::initialize_mpts; use crate::cpu::kernel::tests::mpt::nibbles_64; -use crate::generation::mpt::{all_mpt_prover_inputs_reversed, AccountRlp}; +use crate::generation::mpt::AccountRlp; use crate::Node; // Test account with a given code hash. 
@@ -28,19 +29,12 @@ fn prepare_interpreter( address: Address, account: &AccountRlp, ) -> Result<()> { - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; let mpt_insert_state_trie = KERNEL.global_labels["mpt_insert_state_trie"]; let mpt_hash_state_trie = KERNEL.global_labels["mpt_hash_state_trie"]; let mut state_trie: HashedPartialTrie = Default::default(); let trie_inputs = Default::default(); - interpreter.generation_state.registers.program_counter = load_all_mpts; - interpreter.push(0xDEADBEEFu32.into()); - - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + initialize_mpts(interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); let k = nibbles_64(U256::from_big_endian( @@ -64,9 +58,15 @@ fn prepare_interpreter( trie_data.push(account.code_hash.into_uint()); let trie_data_len = trie_data.len().into(); interpreter.set_global_metadata_field(GlobalMetadata::TrieDataSize, trie_data_len); - interpreter.push(0xDEADBEEFu32.into()); - interpreter.push(value_ptr.into()); // value_ptr - interpreter.push(k.try_into_u256().unwrap()); // key + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(value_ptr.into()) + .expect("The stack should not overflow"); // value_ptr + interpreter + .push(k.try_into_u256().unwrap()) + .expect("The stack should not overflow"); // key interpreter.run()?; assert_eq!( @@ -78,16 +78,21 @@ fn prepare_interpreter( // Now, execute mpt_hash_state_trie. interpreter.generation_state.registers.program_counter = mpt_hash_state_trie; - interpreter.push(0xDEADBEEFu32.into()); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(1.into()) // Initial trie data segment size, unused. 
+ .expect("The stack should not overflow"); interpreter.run()?; assert_eq!( interpreter.stack().len(), - 1, - "Expected 1 item on stack after hashing, found {:?}", + 2, + "Expected 2 items on stack after hashing, found {:?}", interpreter.stack() ); - let hash = H256::from_uint(&interpreter.stack()[0]); + let hash = H256::from_uint(&interpreter.stack()[1]); state_trie.insert(k, rlp::encode(account).to_vec()); let expected_state_trie_hash = state_trie.hash(); @@ -109,10 +114,15 @@ fn test_balance() -> Result<()> { // Test `balance` interpreter.generation_state.registers.program_counter = KERNEL.global_labels["balance"]; - interpreter.pop(); + interpreter.pop().expect("The stack should not be empty"); + interpreter.pop().expect("The stack should not be empty"); assert!(interpreter.stack().is_empty()); - interpreter.push(0xDEADBEEFu32.into()); - interpreter.push(U256::from_big_endian(address.as_bytes())); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(U256::from_big_endian(address.as_bytes())) + .expect("The stack should not overflow"); interpreter.run()?; assert_eq!(interpreter.stack(), vec![balance]); diff --git a/evm/src/cpu/kernel/tests/bignum/mod.rs b/evm/src/cpu/kernel/tests/bignum/mod.rs index 9e15d96f02..0cc6f0dc1b 100644 --- a/evm/src/cpu/kernel/tests/bignum/mod.rs +++ b/evm/src/cpu/kernel/tests/bignum/mod.rs @@ -1,4 +1,4 @@ -use std::cmp::Ordering; +use core::cmp::Ordering; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::PathBuf; diff --git a/evm/src/cpu/kernel/tests/blake2_f.rs b/evm/src/cpu/kernel/tests/blake2_f.rs index b12c9f32a6..c5d800c5b6 100644 --- a/evm/src/cpu/kernel/tests/blake2_f.rs +++ b/evm/src/cpu/kernel/tests/blake2_f.rs @@ -5,6 +5,8 @@ use crate::cpu::kernel::interpreter::{ }; use crate::memory::segments::Segment::KernelGeneral; +type ConvertedBlakeInputs = (u32, [u64; 8], [u64; 16], u64, u64, bool); + fn reverse_bytes_u64(input: u64) -> u64 { let mut result = 
0; for i in 0..8 { @@ -13,7 +15,7 @@ fn reverse_bytes_u64(input: u64) -> u64 { result } -fn convert_input(input: &str) -> Result<(u32, [u64; 8], [u64; 16], u64, u64, bool)> { +fn convert_input(input: &str) -> Result { let rounds = u32::from_str_radix(&input[..8], 16).unwrap(); let mut h = [0u64; 8]; diff --git a/evm/src/cpu/kernel/tests/bn254.rs b/evm/src/cpu/kernel/tests/bn254.rs index 5ed60e7a32..8a90ff2479 100644 --- a/evm/src/cpu/kernel/tests/bn254.rs +++ b/evm/src/cpu/kernel/tests/bn254.rs @@ -117,8 +117,8 @@ fn run_bn_frob_fp12(f: Fp12, n: usize) -> Fp12 { segment: BnPairing, memory: vec![(ptr, f.to_stack().to_vec())], }; - let interpeter: Interpreter = run_interpreter_with_memory(setup).unwrap(); - let output: Vec = interpeter.extract_kernel_memory(BnPairing, ptr..ptr + 12); + let interpreter: Interpreter = run_interpreter_with_memory(setup).unwrap(); + let output: Vec = interpreter.extract_kernel_memory(BnPairing, ptr..ptr + 12); Fp12::::from_stack(&output) } diff --git a/evm/src/cpu/kernel/tests/core/access_lists.rs b/evm/src/cpu/kernel/tests/core/access_lists.rs index c62d48656b..69dd2d27d4 100644 --- a/evm/src/cpu/kernel/tests/core/access_lists.rs +++ b/evm/src/cpu/kernel/tests/core/access_lists.rs @@ -9,7 +9,7 @@ use crate::cpu::kernel::constants::global_metadata::GlobalMetadata::{ AccessedAddressesLen, AccessedStorageKeysLen, }; use crate::cpu::kernel::interpreter::Interpreter; -use crate::memory::segments::Segment::{AccessedAddresses, AccessedStorageKeys, GlobalMetadata}; +use crate::memory::segments::Segment::{AccessedAddresses, AccessedStorageKeys}; use crate::witness::memory::MemoryAddress; #[test] @@ -42,17 +42,16 @@ fn test_insert_accessed_addresses() -> Result<()> { .set(MemoryAddress::new(0, AccessedAddresses, i), addr); } interpreter.generation_state.memory.set( - MemoryAddress::new(0, GlobalMetadata, AccessedAddressesLen as usize), + MemoryAddress::new_bundle(U256::from(AccessedAddressesLen as usize)).unwrap(), U256::from(n), ); 
interpreter.run()?; assert_eq!(interpreter.stack(), &[U256::zero()]); assert_eq!( - interpreter.generation_state.memory.get(MemoryAddress::new( - 0, - GlobalMetadata, - AccessedAddressesLen as usize - )), + interpreter + .generation_state + .memory + .get(MemoryAddress::new_bundle(U256::from(AccessedAddressesLen as usize)).unwrap()), U256::from(n) ); @@ -67,17 +66,16 @@ fn test_insert_accessed_addresses() -> Result<()> { .set(MemoryAddress::new(0, AccessedAddresses, i), addr); } interpreter.generation_state.memory.set( - MemoryAddress::new(0, GlobalMetadata, AccessedAddressesLen as usize), + MemoryAddress::new_bundle(U256::from(AccessedAddressesLen as usize)).unwrap(), U256::from(n), ); interpreter.run()?; assert_eq!(interpreter.stack(), &[U256::one()]); assert_eq!( - interpreter.generation_state.memory.get(MemoryAddress::new( - 0, - GlobalMetadata, - AccessedAddressesLen as usize - )), + interpreter + .generation_state + .memory + .get(MemoryAddress::new_bundle(U256::from(AccessedAddressesLen as usize)).unwrap()), U256::from(n + 1) ); assert_eq!( @@ -134,17 +132,16 @@ fn test_insert_accessed_storage_keys() -> Result<()> { ); } interpreter.generation_state.memory.set( - MemoryAddress::new(0, GlobalMetadata, AccessedStorageKeysLen as usize), + MemoryAddress::new_bundle(U256::from(AccessedStorageKeysLen as usize)).unwrap(), U256::from(3 * n), ); interpreter.run()?; assert_eq!(interpreter.stack(), &[storage_key_in_list.2, U256::zero()]); assert_eq!( - interpreter.generation_state.memory.get(MemoryAddress::new( - 0, - GlobalMetadata, - AccessedStorageKeysLen as usize - )), + interpreter + .generation_state + .memory + .get(MemoryAddress::new_bundle(U256::from(AccessedStorageKeysLen as usize)).unwrap()), U256::from(3 * n) ); @@ -172,7 +169,7 @@ fn test_insert_accessed_storage_keys() -> Result<()> { ); } interpreter.generation_state.memory.set( - MemoryAddress::new(0, GlobalMetadata, AccessedStorageKeysLen as usize), + 
MemoryAddress::new_bundle(U256::from(AccessedStorageKeysLen as usize)).unwrap(), U256::from(3 * n), ); interpreter.run()?; @@ -181,11 +178,10 @@ fn test_insert_accessed_storage_keys() -> Result<()> { &[storage_key_not_in_list.2, U256::one()] ); assert_eq!( - interpreter.generation_state.memory.get(MemoryAddress::new( - 0, - GlobalMetadata, - AccessedStorageKeysLen as usize - )), + interpreter + .generation_state + .memory + .get(MemoryAddress::new_bundle(U256::from(AccessedStorageKeysLen as usize)).unwrap()), U256::from(3 * (n + 1)) ); assert_eq!( diff --git a/evm/src/cpu/kernel/tests/core/jumpdest_analysis.rs b/evm/src/cpu/kernel/tests/core/jumpdest_analysis.rs index 022a18d729..d704cc198d 100644 --- a/evm/src/cpu/kernel/tests/core/jumpdest_analysis.rs +++ b/evm/src/cpu/kernel/tests/core/jumpdest_analysis.rs @@ -1,8 +1,12 @@ +use std::collections::{BTreeSet, HashMap}; + use anyhow::Result; +use ethereum_types::U256; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::interpreter::Interpreter; use crate::cpu::kernel::opcodes::{get_opcode, get_push_opcode}; +use crate::witness::operation::CONTEXT_SCALING_FACTOR; #[test] fn test_jumpdest_analysis() -> Result<()> { @@ -25,18 +29,94 @@ fn test_jumpdest_analysis() -> Result<()> { jumpdest, ]; - let expected_jumpdest_bits = vec![false, true, false, false, false, true, false, true]; + let jumpdest_bits = vec![false, true, false, false, false, true, false, true]; // Contract creation transaction. 
- let initial_stack = vec![0xDEADBEEFu32.into(), code.len().into(), CONTEXT.into()]; + let initial_stack = vec![ + 0xDEADBEEFu32.into(), + code.len().into(), + U256::from(CONTEXT) << CONTEXT_SCALING_FACTOR, + ]; let mut interpreter = Interpreter::new_with_kernel(jumpdest_analysis, initial_stack); interpreter.set_code(CONTEXT, code); - interpreter.run()?; - assert_eq!(interpreter.stack(), vec![]); + interpreter.set_jumpdest_analysis_inputs(HashMap::from([( + 3, + BTreeSet::from_iter( + jumpdest_bits + .iter() + .enumerate() + .filter(|&(_, &x)| x) + .map(|(i, _)| i), + ), + )])); + assert_eq!( - interpreter.get_jumpdest_bits(CONTEXT), - expected_jumpdest_bits + interpreter.generation_state.jumpdest_table, + // Context 3 has jumpdest 1, 5, 7. All have proof 0 and hence + // the list [proof_0, jumpdest_0, ... ] is [0, 1, 0, 5, 0, 7] + Some(HashMap::from([(3, vec![0, 1, 0, 5, 0, 7])])) ); + interpreter.run()?; + assert_eq!(interpreter.stack(), vec![]); + + assert_eq!(jumpdest_bits, interpreter.get_jumpdest_bits(3)); + + Ok(()) +} + +#[test] +fn test_packed_verification() -> Result<()> { + let jumpdest_analysis = KERNEL.global_labels["jumpdest_analysis"]; + const CONTEXT: usize = 3; // arbitrary + + let add = get_opcode("ADD"); + let jumpdest = get_opcode("JUMPDEST"); + + // The last push(i=0) is 0x5f which is not a valid opcode. However, this + // is still meaningful for the test and makes things easier + let mut code: Vec = std::iter::once(add) + .chain( + (0..=31) + .rev() + .map(get_push_opcode) + .chain(std::iter::once(jumpdest)), + ) + .collect(); + + let jumpdest_bits: Vec = std::iter::repeat(false) + .take(33) + .chain(std::iter::once(true)) + .collect(); + + // Contract creation transaction. 
+ let initial_stack = vec![ + 0xDEADBEEFu32.into(), + code.len().into(), + U256::from(CONTEXT) << CONTEXT_SCALING_FACTOR, + ]; + let mut interpreter = Interpreter::new_with_kernel(jumpdest_analysis, initial_stack.clone()); + interpreter.set_code(CONTEXT, code.clone()); + interpreter.generation_state.jumpdest_table = Some(HashMap::from([(3, vec![1, 33])])); + + interpreter.run()?; + + assert_eq!(jumpdest_bits, interpreter.get_jumpdest_bits(CONTEXT)); + + // If we add 1 to each opcode the jumpdest at position 32 is never a valid jumpdest + for i in 1..=32 { + code[i] += 1; + let mut interpreter = + Interpreter::new_with_kernel(jumpdest_analysis, initial_stack.clone()); + interpreter.set_code(CONTEXT, code.clone()); + interpreter.generation_state.jumpdest_table = Some(HashMap::from([(3, vec![1, 33])])); + + interpreter.run()?; + + assert!(interpreter.get_jumpdest_bits(CONTEXT).is_empty()); + + code[i] -= 1; + } + Ok(()) } diff --git a/evm/src/cpu/kernel/tests/exp.rs b/evm/src/cpu/kernel/tests/exp.rs index 1655064e7c..482c6b7216 100644 --- a/evm/src/cpu/kernel/tests/exp.rs +++ b/evm/src/cpu/kernel/tests/exp.rs @@ -3,7 +3,7 @@ use ethereum_types::U256; use rand::{thread_rng, Rng}; use crate::cpu::kernel::aggregator::KERNEL; -use crate::cpu::kernel::interpreter::{run, run_interpreter}; +use crate::cpu::kernel::interpreter::{run_interpreter, Interpreter}; #[test] fn test_exp() -> Result<()> { @@ -15,33 +15,28 @@ fn test_exp() -> Result<()> { // Random input let initial_stack = vec![0xDEADBEEFu32.into(), b, a]; - let stack_with_kernel = run_interpreter(exp, initial_stack)?.stack().to_vec(); - let initial_stack = vec![b, a]; - let code = [0xa, 0x63, 0xde, 0xad, 0xbe, 0xef, 0x56]; // EXP, PUSH4 deadbeef, JUMP - let stack_with_opcode = run(&code, 0, initial_stack, &KERNEL.prover_inputs)? 
- .stack() - .to_vec(); - assert_eq!(stack_with_kernel, stack_with_opcode); + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack.clone()); + + let stack_with_kernel = run_interpreter(exp, initial_stack)?.stack(); + + let expected_exp = a.overflowing_pow(b).0; + assert_eq!(stack_with_kernel, vec![expected_exp]); // 0 base let initial_stack = vec![0xDEADBEEFu32.into(), b, U256::zero()]; - let stack_with_kernel = run_interpreter(exp, initial_stack)?.stack().to_vec(); - let initial_stack = vec![b, U256::zero()]; - let code = [0xa, 0x63, 0xde, 0xad, 0xbe, 0xef, 0x56]; // EXP, PUSH4 deadbeef, JUMP - let stack_with_opcode = run(&code, 0, initial_stack, &KERNEL.prover_inputs)? - .stack() - .to_vec(); - assert_eq!(stack_with_kernel, stack_with_opcode); + let stack_with_kernel = run_interpreter(exp, initial_stack)?.stack(); + + let expected_exp = U256::zero().overflowing_pow(b).0; + assert_eq!(stack_with_kernel, vec![expected_exp]); // 0 exponent let initial_stack = vec![0xDEADBEEFu32.into(), U256::zero(), a]; - let stack_with_kernel = run_interpreter(exp, initial_stack)?.stack().to_vec(); - let initial_stack = vec![U256::zero(), a]; - let code = [0xa, 0x63, 0xde, 0xad, 0xbe, 0xef, 0x56]; // EXP, PUSH4 deadbeef, JUMP - let stack_with_opcode = run(&code, 0, initial_stack, &KERNEL.prover_inputs)? - .stack() - .to_vec(); - assert_eq!(stack_with_kernel, stack_with_opcode); + interpreter.set_is_kernel(true); + interpreter.set_context(0); + let stack_with_kernel = run_interpreter(exp, initial_stack)?.stack(); + + let expected_exp = 1.into(); + assert_eq!(stack_with_kernel, vec![expected_exp]); Ok(()) } diff --git a/evm/src/cpu/kernel/tests/hash.rs b/evm/src/cpu/kernel/tests/hash.rs index 9b96de91be..6371f0a8a3 100644 --- a/evm/src/cpu/kernel/tests/hash.rs +++ b/evm/src/cpu/kernel/tests/hash.rs @@ -65,7 +65,7 @@ fn prepare_test( // Load the message into the kernel. 
let interpreter_setup = make_interpreter_setup(message, hash_fn_label, hash_input_virt); - // Run the interpeter + // Run the interpreter let result = run_interpreter_with_memory(interpreter_setup).unwrap(); Ok((expected, result.stack().to_vec())) diff --git a/evm/src/cpu/kernel/tests/kernel_consistency.rs b/evm/src/cpu/kernel/tests/kernel_consistency.rs new file mode 100644 index 0000000000..b02c11a234 --- /dev/null +++ b/evm/src/cpu/kernel/tests/kernel_consistency.rs @@ -0,0 +1,13 @@ +use anyhow::Result; + +use crate::cpu::kernel::aggregator::{combined_kernel, KERNEL}; + +#[test] +fn test_kernel_code_hash_consistency() -> Result<()> { + for _ in 0..10 { + let kernel2 = combined_kernel(); + assert_eq!(kernel2.code_hash, KERNEL.code_hash); + } + + Ok(()) +} diff --git a/evm/src/cpu/kernel/tests/mod.rs b/evm/src/cpu/kernel/tests/mod.rs index b66c016266..7581eefe75 100644 --- a/evm/src/cpu/kernel/tests/mod.rs +++ b/evm/src/cpu/kernel/tests/mod.rs @@ -1,4 +1,5 @@ mod account_code; +mod add11; mod balance; mod bignum; mod blake2_f; @@ -9,6 +10,7 @@ mod core; mod ecc; mod exp; mod hash; +mod kernel_consistency; mod log; mod mpt; mod packing; diff --git a/evm/src/cpu/kernel/tests/mpt/delete.rs b/evm/src/cpu/kernel/tests/mpt/delete.rs index 074eea26ef..0d4d5e71f7 100644 --- a/evm/src/cpu/kernel/tests/mpt/delete.rs +++ b/evm/src/cpu/kernel/tests/mpt/delete.rs @@ -1,13 +1,15 @@ -use anyhow::{anyhow, Result}; +use anyhow::Result; use eth_trie_utils::nibbles::Nibbles; use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; -use ethereum_types::{BigEndianHash, H256}; +use ethereum_types::{BigEndianHash, H256, U512}; +use rand::random; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cpu::kernel::interpreter::Interpreter; +use crate::cpu::kernel::tests::account_code::initialize_mpts; use crate::cpu::kernel::tests::mpt::{nibbles_64, test_account_1_rlp, test_account_2}; -use 
crate::generation::mpt::{all_mpt_prover_inputs_reversed, AccountRlp}; +use crate::generation::mpt::AccountRlp; use crate::generation::TrieInputs; use crate::Node; @@ -47,6 +49,32 @@ fn mpt_delete_branch_into_hash() -> Result<()> { test_state_trie(state_trie, nibbles_64(0xADE), test_account_2()) } +#[test] +fn test_after_mpt_delete_extension_branch() -> Result<()> { + let hash = Node::Hash(H256::random()); + let branch = Node::Branch { + children: std::array::from_fn(|i| { + if i == 0 { + Node::Empty.into() + } else { + hash.clone().into() + } + }), + value: vec![], + }; + let nibbles = Nibbles::from_bytes_be(&random::<[u8; 5]>()).unwrap(); + let state_trie = Node::Extension { + nibbles, + child: branch.into(), + } + .into(); + let key = nibbles.merge_nibbles(&Nibbles { + packed: U512::zero(), + count: 64 - nibbles.count, + }); + test_state_trie(state_trie, key, test_account_2()) +} + /// Note: The account's storage_root is ignored, as we can't insert a new storage_root without the /// accompanying trie data. An empty trie's storage_root is used instead. fn test_state_trie( @@ -65,16 +93,14 @@ fn test_state_trie( receipts_trie: Default::default(), storage_tries: vec![], }; - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; let mpt_insert_state_trie = KERNEL.global_labels["mpt_insert_state_trie"]; let mpt_delete = KERNEL.global_labels["mpt_delete"]; let mpt_hash_state_trie = KERNEL.global_labels["mpt_hash_state_trie"]; - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs).map_err(|_| anyhow!("Invalid MPT data"))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); // Next, execute mpt_insert_state_trie. 
@@ -95,9 +121,15 @@ fn test_state_trie( trie_data.push(account.code_hash.into_uint()); let trie_data_len = trie_data.len().into(); interpreter.set_global_metadata_field(GlobalMetadata::TrieDataSize, trie_data_len); - interpreter.push(0xDEADBEEFu32.into()); - interpreter.push(value_ptr.into()); // value_ptr - interpreter.push(k.try_into_u256().unwrap()); // key + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(value_ptr.into()) + .expect("The stack should not overflow"); // value_ptr + interpreter + .push(k.try_into_u256().unwrap()) + .expect("The stack should not overflow"); // key interpreter.run()?; assert_eq!( interpreter.stack().len(), @@ -109,20 +141,34 @@ fn test_state_trie( // Next, execute mpt_delete, deleting the account we just inserted. let state_trie_ptr = interpreter.get_global_metadata_field(GlobalMetadata::StateTrieRoot); interpreter.generation_state.registers.program_counter = mpt_delete; - interpreter.push(0xDEADBEEFu32.into()); - interpreter.push(k.try_into_u256().unwrap()); - interpreter.push(64.into()); - interpreter.push(state_trie_ptr); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(k.try_into_u256().unwrap()) + .expect("The stack should not overflow"); + interpreter + .push(64.into()) + .expect("The stack should not overflow"); + interpreter + .push(state_trie_ptr) + .expect("The stack should not overflow"); interpreter.run()?; - let state_trie_ptr = interpreter.pop(); + let state_trie_ptr = interpreter.pop().expect("The stack should not be empty"); interpreter.set_global_metadata_field(GlobalMetadata::StateTrieRoot, state_trie_ptr); // Now, execute mpt_hash_state_trie. 
interpreter.generation_state.registers.program_counter = mpt_hash_state_trie; - interpreter.push(0xDEADBEEFu32.into()); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(1.into()) // Initial length of the trie data segment, unused. + .expect("The stack should not overflow"); interpreter.run()?; - let state_trie_hash = H256::from_uint(&interpreter.pop()); + let state_trie_hash = + H256::from_uint(&interpreter.pop().expect("The stack should not be empty")); let expected_state_trie_hash = state_trie.hash(); assert_eq!(state_trie_hash, expected_state_trie_hash); diff --git a/evm/src/cpu/kernel/tests/mpt/hash.rs b/evm/src/cpu/kernel/tests/mpt/hash.rs index 05077a94da..a06dd2a0b5 100644 --- a/evm/src/cpu/kernel/tests/mpt/hash.rs +++ b/evm/src/cpu/kernel/tests/mpt/hash.rs @@ -1,11 +1,11 @@ -use anyhow::{anyhow, Result}; +use anyhow::Result; use eth_trie_utils::partial_trie::PartialTrie; use ethereum_types::{BigEndianHash, H256}; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::interpreter::Interpreter; +use crate::cpu::kernel::tests::account_code::initialize_mpts; use crate::cpu::kernel::tests::mpt::{extension_to_leaf, test_account_1_rlp, test_account_2_rlp}; -use crate::generation::mpt::all_mpt_prover_inputs_reversed; use crate::generation::TrieInputs; use crate::Node; @@ -108,28 +108,31 @@ fn mpt_hash_branch_to_leaf() -> Result<()> { } fn test_state_trie(trie_inputs: TrieInputs) -> Result<()> { - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; let mpt_hash_state_trie = KERNEL.global_labels["mpt_hash_state_trie"]; - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs).map_err(|_| anyhow!("Invalid MPT data"))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = 
Interpreter::new_with_kernel(0, initial_stack); + + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); // Now, execute mpt_hash_state_trie. interpreter.generation_state.registers.program_counter = mpt_hash_state_trie; - interpreter.push(0xDEADBEEFu32.into()); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(1.into()) // Initial length of the trie data segment, unused. + .expect("The stack should not overflow"); interpreter.run()?; assert_eq!( interpreter.stack().len(), - 1, - "Expected 1 item on stack, found {:?}", + 2, + "Expected 2 items on stack, found {:?}", interpreter.stack() ); - let hash = H256::from_uint(&interpreter.stack()[0]); + let hash = H256::from_uint(&interpreter.stack()[1]); let expected_state_trie_hash = trie_inputs.state_trie.hash(); assert_eq!(hash, expected_state_trie_hash); diff --git a/evm/src/cpu/kernel/tests/mpt/hex_prefix.rs b/evm/src/cpu/kernel/tests/mpt/hex_prefix.rs index c13b812220..e51e60ab46 100644 --- a/evm/src/cpu/kernel/tests/mpt/hex_prefix.rs +++ b/evm/src/cpu/kernel/tests/mpt/hex_prefix.rs @@ -1,7 +1,9 @@ use anyhow::Result; +use ethereum_types::U256; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::interpreter::Interpreter; +use crate::memory::segments::Segment; #[test] fn hex_prefix_even_nonterminated() -> Result<()> { @@ -11,11 +13,11 @@ fn hex_prefix_even_nonterminated() -> Result<()> { let terminated = 0.into(); let packed_nibbles = 0xABCDEF.into(); let num_nibbles = 6.into(); - let rlp_pos = 0.into(); + let rlp_pos = U256::from(Segment::RlpRaw as usize); let initial_stack = vec![retdest, terminated, packed_nibbles, num_nibbles, rlp_pos]; let mut interpreter = Interpreter::new_with_kernel(hex_prefix, initial_stack); interpreter.run()?; - assert_eq!(interpreter.stack(), vec![5.into()]); + assert_eq!(interpreter.stack(), vec![rlp_pos + U256::from(5)]); assert_eq!( interpreter.get_rlp_memory(), @@ -39,11 
+41,11 @@ fn hex_prefix_odd_terminated() -> Result<()> { let terminated = 1.into(); let packed_nibbles = 0xABCDE.into(); let num_nibbles = 5.into(); - let rlp_pos = 0.into(); + let rlp_pos = U256::from(Segment::RlpRaw as usize); let initial_stack = vec![retdest, terminated, packed_nibbles, num_nibbles, rlp_pos]; let mut interpreter = Interpreter::new_with_kernel(hex_prefix, initial_stack); interpreter.run()?; - assert_eq!(interpreter.stack(), vec![4.into()]); + assert_eq!(interpreter.stack(), vec![rlp_pos + U256::from(4)]); assert_eq!( interpreter.get_rlp_memory(), @@ -66,11 +68,14 @@ fn hex_prefix_odd_terminated_tiny() -> Result<()> { let terminated = 1.into(); let packed_nibbles = 0xA.into(); let num_nibbles = 1.into(); - let rlp_pos = 2.into(); + let rlp_pos = U256::from(Segment::RlpRaw as usize + 2); let initial_stack = vec![retdest, terminated, packed_nibbles, num_nibbles, rlp_pos]; let mut interpreter = Interpreter::new_with_kernel(hex_prefix, initial_stack); interpreter.run()?; - assert_eq!(interpreter.stack(), vec![3.into()]); + assert_eq!( + interpreter.stack(), + vec![U256::from(Segment::RlpRaw as usize + 3)] + ); assert_eq!( interpreter.get_rlp_memory(), diff --git a/evm/src/cpu/kernel/tests/mpt/insert.rs b/evm/src/cpu/kernel/tests/mpt/insert.rs index 6fd95a30b9..19b82f74a2 100644 --- a/evm/src/cpu/kernel/tests/mpt/insert.rs +++ b/evm/src/cpu/kernel/tests/mpt/insert.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Result}; +use anyhow::Result; use eth_trie_utils::nibbles::Nibbles; use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; use ethereum_types::{BigEndianHash, H256}; @@ -6,10 +6,11 @@ use ethereum_types::{BigEndianHash, H256}; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cpu::kernel::interpreter::Interpreter; +use crate::cpu::kernel::tests::account_code::initialize_mpts; use crate::cpu::kernel::tests::mpt::{ nibbles_64, nibbles_count, test_account_1_rlp, 
test_account_2, }; -use crate::generation::mpt::{all_mpt_prover_inputs_reversed, AccountRlp}; +use crate::generation::mpt::AccountRlp; use crate::generation::TrieInputs; use crate::Node; @@ -168,15 +169,13 @@ fn test_state_trie( receipts_trie: Default::default(), storage_tries: vec![], }; - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; let mpt_insert_state_trie = KERNEL.global_labels["mpt_insert_state_trie"]; let mpt_hash_state_trie = KERNEL.global_labels["mpt_hash_state_trie"]; - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs).map_err(|_| anyhow!("Invalid MPT data"))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); // Next, execute mpt_insert_state_trie. @@ -197,9 +196,15 @@ fn test_state_trie( trie_data.push(account.code_hash.into_uint()); let trie_data_len = trie_data.len().into(); interpreter.set_global_metadata_field(GlobalMetadata::TrieDataSize, trie_data_len); - interpreter.push(0xDEADBEEFu32.into()); - interpreter.push(value_ptr.into()); // value_ptr - interpreter.push(k.try_into_u256().unwrap()); // key + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(value_ptr.into()) + .expect("The stack should not overflow"); // value_ptr + interpreter + .push(k.try_into_u256().unwrap()) + .expect("The stack should not overflow"); // key interpreter.run()?; assert_eq!( @@ -211,16 +216,21 @@ fn test_state_trie( // Now, execute mpt_hash_state_trie. 
interpreter.generation_state.registers.program_counter = mpt_hash_state_trie; - interpreter.push(0xDEADBEEFu32.into()); + interpreter + .push(0xDEADBEEFu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(1.into()) // Initial length of the trie data segment, unused. + .expect("The stack should not overflow"); interpreter.run()?; assert_eq!( interpreter.stack().len(), - 1, - "Expected 1 item on stack after hashing, found {:?}", + 2, + "Expected 2 items on stack after hashing, found {:?}", interpreter.stack() ); - let hash = H256::from_uint(&interpreter.stack()[0]); + let hash = H256::from_uint(&interpreter.stack()[1]); state_trie.insert(k, rlp::encode(&account).to_vec()); let expected_state_trie_hash = state_trie.hash(); diff --git a/evm/src/cpu/kernel/tests/mpt/load.rs b/evm/src/cpu/kernel/tests/mpt/load.rs index ae0bfa3bc8..bff1d8cb39 100644 --- a/evm/src/cpu/kernel/tests/mpt/load.rs +++ b/evm/src/cpu/kernel/tests/mpt/load.rs @@ -1,17 +1,16 @@ use std::str::FromStr; -use anyhow::{anyhow, Result}; +use anyhow::Result; use eth_trie_utils::nibbles::Nibbles; use eth_trie_utils::partial_trie::HashedPartialTrie; use ethereum_types::{BigEndianHash, H256, U256}; use hex_literal::hex; -use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cpu::kernel::constants::trie_type::PartialTrieType; use crate::cpu::kernel::interpreter::Interpreter; +use crate::cpu::kernel::tests::account_code::initialize_mpts; use crate::cpu::kernel::tests::mpt::{extension_to_leaf, test_account_1, test_account_1_rlp}; -use crate::generation::mpt::all_mpt_prover_inputs_reversed; use crate::generation::TrieInputs; use crate::Node; @@ -24,17 +23,13 @@ fn load_all_mpts_empty() -> Result<()> { storage_tries: vec![], }; - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; - - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); 
- interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); - assert_eq!(interpreter.get_trie_data(), vec![]); + // We need to have the first element in `TrieData` be 0. + assert_eq!(interpreter.get_trie_data(), vec![0.into()]); assert_eq!( interpreter.get_global_metadata_field(GlobalMetadata::StateTrieRoot), @@ -65,14 +60,9 @@ fn load_all_mpts_leaf() -> Result<()> { storage_tries: vec![], }; - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; - - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); let type_leaf = U256::from(PartialTrieType::Leaf as u32); @@ -116,14 +106,9 @@ fn load_all_mpts_hash() -> Result<()> { storage_tries: vec![], }; - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; - - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); let 
type_hash = U256::from(PartialTrieType::Hash as u32); @@ -159,14 +144,9 @@ fn load_all_mpts_empty_branch() -> Result<()> { storage_tries: vec![], }; - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; - - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); let type_branch = U256::from(PartialTrieType::Branch as u32); @@ -216,14 +196,9 @@ fn load_all_mpts_ext_to_leaf() -> Result<()> { storage_tries: vec![], }; - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; - - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); let type_extension = U256::from(PartialTrieType::Extension as u32); @@ -255,8 +230,6 @@ fn load_all_mpts_ext_to_leaf() -> Result<()> { #[test] fn load_mpt_txn_trie() -> Result<()> { - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; - let txn = hex!("f860010a830186a094095e7baea6a6c7c4c2dfeb977efac326af552e89808025a04a223955b0bd3827e3740a9a427d0ea43beb5bafa44a0204bf0a3306c8219f7ba0502c32d78f233e9e7ce9f5df3b576556d5d49731e0678fd5a068cdf359557b5b").to_vec(); let trie_inputs = TrieInputs { @@ -269,12 +242,9 @@ fn load_mpt_txn_trie() -> 
Result<()> { storage_tries: vec![], }; - let initial_stack = vec![0xDEADBEEFu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + initialize_mpts(&mut interpreter, &trie_inputs); assert_eq!(interpreter.stack(), vec![]); let mut expected_trie_data = vec![ diff --git a/evm/src/cpu/kernel/tests/mpt/read.rs b/evm/src/cpu/kernel/tests/mpt/read.rs index f9ae94f03b..16206d1390 100644 --- a/evm/src/cpu/kernel/tests/mpt/read.rs +++ b/evm/src/cpu/kernel/tests/mpt/read.rs @@ -1,11 +1,11 @@ -use anyhow::{anyhow, Result}; +use anyhow::Result; use ethereum_types::BigEndianHash; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cpu::kernel::interpreter::Interpreter; +use crate::cpu::kernel::tests::account_code::initialize_mpts; use crate::cpu::kernel::tests::mpt::{extension_to_leaf, test_account_1, test_account_1_rlp}; -use crate::generation::mpt::all_mpt_prover_inputs_reversed; use crate::generation::TrieInputs; #[test] @@ -17,23 +17,27 @@ fn mpt_read() -> Result<()> { storage_tries: vec![], }; - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; let mpt_read = KERNEL.global_labels["mpt_read"]; - let initial_stack = vec![0xdeadbeefu32.into()]; - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + let initial_stack = vec![]; + let mut interpreter = Interpreter::new_with_kernel(0, initial_stack); + initialize_mpts(&mut interpreter, &trie_inputs); 
assert_eq!(interpreter.stack(), vec![]); // Now, execute mpt_read on the state trie. interpreter.generation_state.registers.program_counter = mpt_read; - interpreter.push(0xdeadbeefu32.into()); - interpreter.push(0xABCDEFu64.into()); - interpreter.push(6.into()); - interpreter.push(interpreter.get_global_metadata_field(GlobalMetadata::StateTrieRoot)); + interpreter + .push(0xdeadbeefu32.into()) + .expect("The stack should not overflow"); + interpreter + .push(0xABCDEFu64.into()) + .expect("The stack should not overflow"); + interpreter + .push(6.into()) + .expect("The stack should not overflow"); + interpreter + .push(interpreter.get_global_metadata_field(GlobalMetadata::StateTrieRoot)) + .expect("The stack should not overflow"); interpreter.run()?; assert_eq!(interpreter.stack().len(), 1); diff --git a/evm/src/cpu/kernel/tests/packing.rs b/evm/src/cpu/kernel/tests/packing.rs index 43ca9b5fc2..0eb09cf7a6 100644 --- a/evm/src/cpu/kernel/tests/packing.rs +++ b/evm/src/cpu/kernel/tests/packing.rs @@ -5,66 +5,6 @@ use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::interpreter::Interpreter; use crate::memory::segments::Segment; -#[test] -fn test_mload_packing_1_byte() -> Result<()> { - let mload_packing = KERNEL.global_labels["mload_packing"]; - - let retdest = 0xDEADBEEFu32.into(); - let len = 1.into(); - let offset = 2.into(); - let segment = (Segment::RlpRaw as u32).into(); - let context = 0.into(); - let initial_stack = vec![retdest, len, offset, segment, context]; - - let mut interpreter = Interpreter::new_with_kernel(mload_packing, initial_stack); - interpreter.set_rlp_memory(vec![0, 0, 0xAB]); - - interpreter.run()?; - assert_eq!(interpreter.stack(), vec![0xAB.into()]); - - Ok(()) -} - -#[test] -fn test_mload_packing_3_bytes() -> Result<()> { - let mload_packing = KERNEL.global_labels["mload_packing"]; - - let retdest = 0xDEADBEEFu32.into(); - let len = 3.into(); - let offset = 2.into(); - let segment = (Segment::RlpRaw as u32).into(); - let 
context = 0.into(); - let initial_stack = vec![retdest, len, offset, segment, context]; - - let mut interpreter = Interpreter::new_with_kernel(mload_packing, initial_stack); - interpreter.set_rlp_memory(vec![0, 0, 0xAB, 0xCD, 0xEF]); - - interpreter.run()?; - assert_eq!(interpreter.stack(), vec![0xABCDEF.into()]); - - Ok(()) -} - -#[test] -fn test_mload_packing_32_bytes() -> Result<()> { - let mload_packing = KERNEL.global_labels["mload_packing"]; - - let retdest = 0xDEADBEEFu32.into(); - let len = 32.into(); - let offset = 0.into(); - let segment = (Segment::RlpRaw as u32).into(); - let context = 0.into(); - let initial_stack = vec![retdest, len, offset, segment, context]; - - let mut interpreter = Interpreter::new_with_kernel(mload_packing, initial_stack); - interpreter.set_rlp_memory(vec![0xFF; 32]); - - interpreter.run()?; - assert_eq!(interpreter.stack(), vec![U256::MAX]); - - Ok(()) -} - #[test] fn test_mstore_unpacking() -> Result<()> { let mstore_unpacking = KERNEL.global_labels["mstore_unpacking"]; @@ -72,15 +12,13 @@ fn test_mstore_unpacking() -> Result<()> { let retdest = 0xDEADBEEFu32.into(); let len = 4.into(); let value = 0xABCD1234u32.into(); - let offset = 0.into(); - let segment = (Segment::TxnData as u32).into(); - let context = 0.into(); - let initial_stack = vec![retdest, len, value, offset, segment, context]; + let addr = (Segment::TxnData as u64).into(); + let initial_stack = vec![retdest, len, value, addr]; let mut interpreter = Interpreter::new_with_kernel(mstore_unpacking, initial_stack); interpreter.run()?; - assert_eq!(interpreter.stack(), vec![4.into()]); + assert_eq!(interpreter.stack(), vec![addr + U256::from(4)]); assert_eq!( &interpreter.get_txn_data(), &[0xAB.into(), 0xCD.into(), 0x12.into(), 0x34.into()] diff --git a/evm/src/cpu/kernel/tests/receipt.rs b/evm/src/cpu/kernel/tests/receipt.rs index f82bbcda43..7d00cb2746 100644 --- a/evm/src/cpu/kernel/tests/receipt.rs +++ b/evm/src/cpu/kernel/tests/receipt.rs @@ -1,4 +1,4 @@ -use 
anyhow::{anyhow, Result}; +use anyhow::Result; use ethereum_types::{Address, U256}; use hex_literal::hex; use keccak_hash::keccak; @@ -8,7 +8,8 @@ use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cpu::kernel::constants::txn_fields::NormalizedTxnField; use crate::cpu::kernel::interpreter::Interpreter; -use crate::generation::mpt::{all_mpt_prover_inputs_reversed, LegacyReceiptRlp, LogRlp}; +use crate::cpu::kernel::tests::account_code::initialize_mpts; +use crate::generation::mpt::{LegacyReceiptRlp, LogRlp}; use crate::memory::segments::Segment; #[test] @@ -59,7 +60,6 @@ fn test_process_receipt() -> Result<()> { ); interpreter.set_txn_field(NormalizedTxnField::GasLimit, U256::from(5000)); interpreter.set_memory_segment(Segment::TxnBloom, vec![0.into(); 256]); - interpreter.set_memory_segment(Segment::BlockBloom, vec![0.into(); 256]); interpreter.set_memory_segment(Segment::Logs, vec![0.into()]); interpreter.set_global_metadata_field(GlobalMetadata::LogsPayloadLen, 58.into()); interpreter.set_global_metadata_field(GlobalMetadata::LogsLen, U256::from(1)); @@ -127,7 +127,7 @@ fn test_receipt_encoding() -> Result<()> { // Get the expected RLP encoding. let expected_rlp = rlp::encode(&rlp::encode(&receipt_1)); - let initial_stack: Vec = vec![retdest, 0.into(), 0.into()]; + let initial_stack: Vec = vec![retdest, 0.into(), 0.into(), 0.into()]; let mut interpreter = Interpreter::new_with_kernel(encode_receipt, initial_stack); // Write data to memory. 
@@ -194,7 +194,7 @@ fn test_receipt_encoding() -> Result<()> { interpreter.set_memory_segment(Segment::TrieData, receipt); interpreter.run()?; - let rlp_pos = interpreter.pop(); + let rlp_pos = interpreter.pop().expect("The stack should not be empty"); let rlp_read: Vec = interpreter.get_rlp_memory(); @@ -265,7 +265,6 @@ fn test_receipt_bloom_filter() -> Result<()> { logs.extend(cur_data); // The Bloom filter initialization is required for this test to ensure we have the correct length for the filters. Otherwise, some trailing zeroes could be missing. interpreter.set_memory_segment(Segment::TxnBloom, vec![0.into(); 256]); // Initialize transaction Bloom filter. - interpreter.set_memory_segment(Segment::BlockBloom, vec![0.into(); 256]); // Initialize block Bloom filter. interpreter.set_memory_segment(Segment::LogsData, logs); interpreter.set_memory_segment(Segment::Logs, vec![0.into()]); interpreter.set_global_metadata_field(GlobalMetadata::LogsLen, U256::from(1)); @@ -296,7 +295,9 @@ fn test_receipt_bloom_filter() -> Result<()> { .map(U256::from); logs2.extend(cur_data); - interpreter.push(retdest); + interpreter + .push(retdest) + .expect("The stack should not overflow"); interpreter.generation_state.registers.program_counter = logs_bloom; interpreter.set_memory_segment(Segment::TxnBloom, vec![0.into(); 256]); // Initialize transaction Bloom filter. interpreter.set_memory_segment(Segment::LogsData, logs2); @@ -327,15 +328,6 @@ fn test_receipt_bloom_filter() -> Result<()> { assert_eq!(second_bloom_bytes, second_loaded_bloom); - // Check the final block Bloom. 
- let block_bloom = hex!("00000000000000000000000000000000000000000000000000800000000000000040000000005000000000000000000000000000000000000000000000000000000000000000000000000000000000000002000000000000000000000000000008000000000000000000000000000000000000000001000000080008000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000500000000000000000000000000000002000040000000000000000000000000000000000000000000000008000000000000000000000100000000000000000000000000020000000000008000000000000000000000000").to_vec(); - let loaded_block_bloom: Vec = interpreter - .get_memory_segment(Segment::BlockBloom) - .into_iter() - .map(|elt| elt.0[0] as u8) - .collect(); - - assert_eq!(block_bloom, loaded_block_bloom); Ok(()) } @@ -349,7 +341,6 @@ fn test_mpt_insert_receipt() -> Result<()> { let retdest = 0xDEADBEEFu32.into(); let trie_inputs = Default::default(); - let load_all_mpts = KERNEL.global_labels["load_all_mpts"]; let mpt_insert = KERNEL.global_labels["mpt_insert_receipt_trie"]; let num_topics = 3; // Both transactions have the same number of topics. let payload_len = 423; // Total payload length for each receipt. @@ -417,14 +408,8 @@ fn test_mpt_insert_receipt() -> Result<()> { receipt.push(num_logs.into()); // num_logs receipt.extend(logs_0.clone()); - // First, we load all mpts. - let initial_stack: Vec = vec![retdest]; - - let mut interpreter = Interpreter::new_with_kernel(load_all_mpts, initial_stack); - interpreter.generation_state.mpt_prover_inputs = - all_mpt_prover_inputs_reversed(&trie_inputs) - .map_err(|err| anyhow!("Invalid MPT data: {:?}", err))?; - interpreter.run()?; + let mut interpreter = Interpreter::new_with_kernel(0, vec![]); + initialize_mpts(&mut interpreter, &trie_inputs); // If TrieData is empty, we need to push 0 because the first value is always 0. 
let mut cur_trie_data = interpreter.get_memory_segment(Segment::TrieData); @@ -441,7 +426,9 @@ fn test_mpt_insert_receipt() -> Result<()> { num_nibbles.into(), ]; for i in 0..initial_stack.len() { - interpreter.push(initial_stack[i]); + interpreter + .push(initial_stack[i]) + .expect("The stack should not overflow"); } interpreter.generation_state.registers.program_counter = mpt_insert; @@ -511,7 +498,9 @@ fn test_mpt_insert_receipt() -> Result<()> { num_nibbles.into(), ]; for i in 0..initial_stack2.len() { - interpreter.push(initial_stack2[i]); + interpreter + .push(initial_stack2[i]) + .expect("The stack should not overflow"); } cur_trie_data.extend(receipt_1); @@ -524,10 +513,15 @@ fn test_mpt_insert_receipt() -> Result<()> { // Finally, check that the hashes correspond. let mpt_hash_receipt = KERNEL.global_labels["mpt_hash_receipt_trie"]; interpreter.generation_state.registers.program_counter = mpt_hash_receipt; - interpreter.push(retdest); + interpreter + .push(retdest) + .expect("The stack should not overflow"); + interpreter + .push(1.into()) // Initial length of the trie data segment, unused.; // Initial length of the trie data segment, unused. + .expect("The stack should not overflow"); interpreter.run()?; assert_eq!( - interpreter.stack()[0], + interpreter.stack()[1], U256::from(hex!( "da46cdd329bfedace32da95f2b344d314bc6f55f027d65f9f4ac04ee425e1f98" )) @@ -570,7 +564,6 @@ fn test_bloom_two_logs() -> Result<()> { ]; let mut interpreter = Interpreter::new_with_kernel(logs_bloom, initial_stack); interpreter.set_memory_segment(Segment::TxnBloom, vec![0.into(); 256]); // Initialize transaction Bloom filter. - interpreter.set_memory_segment(Segment::BlockBloom, vec![0.into(); 256]); // Initialize block Bloom filter. 
interpreter.set_memory_segment(Segment::LogsData, logs); interpreter.set_memory_segment(Segment::Logs, vec![0.into(), 4.into()]); interpreter.set_global_metadata_field(GlobalMetadata::LogsLen, U256::from(2)); @@ -588,7 +581,7 @@ fn test_bloom_two_logs() -> Result<()> { Ok(()) } -pub fn logs_bloom_bytes_fn(logs_list: Vec<(Vec, Vec>)>) -> [u8; 256] { +fn logs_bloom_bytes_fn(logs_list: Vec<(Vec, Vec>)>) -> [u8; 256] { // The first element of logs_list. let mut bloom = [0_u8; 256]; diff --git a/evm/src/cpu/kernel/tests/rlp/decode.rs b/evm/src/cpu/kernel/tests/rlp/decode.rs index a1ca3609ad..1f3260e56f 100644 --- a/evm/src/cpu/kernel/tests/rlp/decode.rs +++ b/evm/src/cpu/kernel/tests/rlp/decode.rs @@ -1,20 +1,25 @@ use anyhow::Result; +use ethereum_types::U256; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::interpreter::Interpreter; +use crate::memory::segments::Segment; #[test] fn test_decode_rlp_string_len_short() -> Result<()> { let decode_rlp_string_len = KERNEL.global_labels["decode_rlp_string_len"]; - let initial_stack = vec![0xDEADBEEFu32.into(), 2.into()]; + let initial_stack = vec![ + 0xDEADBEEFu32.into(), + U256::from(Segment::RlpRaw as usize + 2), + ]; let mut interpreter = Interpreter::new_with_kernel(decode_rlp_string_len, initial_stack); // A couple dummy bytes, followed by "0x70" which is its own encoding. 
interpreter.set_rlp_memory(vec![123, 234, 0x70]); interpreter.run()?; - let expected_stack = vec![1.into(), 2.into()]; // len, pos + let expected_stack = vec![1.into(), U256::from(Segment::RlpRaw as usize + 2)]; // len, pos assert_eq!(interpreter.stack(), expected_stack); Ok(()) @@ -24,14 +29,17 @@ fn test_decode_rlp_string_len_short() -> Result<()> { fn test_decode_rlp_string_len_medium() -> Result<()> { let decode_rlp_string_len = KERNEL.global_labels["decode_rlp_string_len"]; - let initial_stack = vec![0xDEADBEEFu32.into(), 2.into()]; + let initial_stack = vec![ + 0xDEADBEEFu32.into(), + U256::from(Segment::RlpRaw as usize + 2), + ]; let mut interpreter = Interpreter::new_with_kernel(decode_rlp_string_len, initial_stack); // A couple dummy bytes, followed by the RLP encoding of "1 2 3 4 5". interpreter.set_rlp_memory(vec![123, 234, 0x85, 1, 2, 3, 4, 5]); interpreter.run()?; - let expected_stack = vec![5.into(), 3.into()]; // len, pos + let expected_stack = vec![5.into(), U256::from(Segment::RlpRaw as usize + 3)]; // len, pos assert_eq!(interpreter.stack(), expected_stack); Ok(()) @@ -41,7 +49,10 @@ fn test_decode_rlp_string_len_medium() -> Result<()> { fn test_decode_rlp_string_len_long() -> Result<()> { let decode_rlp_string_len = KERNEL.global_labels["decode_rlp_string_len"]; - let initial_stack = vec![0xDEADBEEFu32.into(), 2.into()]; + let initial_stack = vec![ + 0xDEADBEEFu32.into(), + U256::from(Segment::RlpRaw as usize + 2), + ]; let mut interpreter = Interpreter::new_with_kernel(decode_rlp_string_len, initial_stack); // The RLP encoding of the string "1 2 3 ... 56". 
@@ -52,7 +63,7 @@ fn test_decode_rlp_string_len_long() -> Result<()> { ]); interpreter.run()?; - let expected_stack = vec![56.into(), 4.into()]; // len, pos + let expected_stack = vec![56.into(), U256::from(Segment::RlpRaw as usize + 4)]; // len, pos assert_eq!(interpreter.stack(), expected_stack); Ok(()) @@ -62,14 +73,14 @@ fn test_decode_rlp_string_len_long() -> Result<()> { fn test_decode_rlp_list_len_short() -> Result<()> { let decode_rlp_list_len = KERNEL.global_labels["decode_rlp_list_len"]; - let initial_stack = vec![0xDEADBEEFu32.into(), 0.into()]; + let initial_stack = vec![0xDEADBEEFu32.into(), U256::from(Segment::RlpRaw as usize)]; let mut interpreter = Interpreter::new_with_kernel(decode_rlp_list_len, initial_stack); // The RLP encoding of [1, 2, [3, 4]]. interpreter.set_rlp_memory(vec![0xc5, 1, 2, 0xc2, 3, 4]); interpreter.run()?; - let expected_stack = vec![5.into(), 1.into()]; // len, pos + let expected_stack = vec![5.into(), U256::from(Segment::RlpRaw as usize + 1)]; // len, pos assert_eq!(interpreter.stack(), expected_stack); Ok(()) @@ -79,7 +90,7 @@ fn test_decode_rlp_list_len_short() -> Result<()> { fn test_decode_rlp_list_len_long() -> Result<()> { let decode_rlp_list_len = KERNEL.global_labels["decode_rlp_list_len"]; - let initial_stack = vec![0xDEADBEEFu32.into(), 0.into()]; + let initial_stack = vec![0xDEADBEEFu32.into(), U256::from(Segment::RlpRaw as usize)]; let mut interpreter = Interpreter::new_with_kernel(decode_rlp_list_len, initial_stack); // The RLP encoding of [1, ..., 56]. 
@@ -90,7 +101,7 @@ fn test_decode_rlp_list_len_long() -> Result<()> { ]); interpreter.run()?; - let expected_stack = vec![56.into(), 2.into()]; // len, pos + let expected_stack = vec![56.into(), U256::from(Segment::RlpRaw as usize + 2)]; // len, pos assert_eq!(interpreter.stack(), expected_stack); Ok(()) @@ -100,14 +111,14 @@ fn test_decode_rlp_list_len_long() -> Result<()> { fn test_decode_rlp_scalar() -> Result<()> { let decode_rlp_scalar = KERNEL.global_labels["decode_rlp_scalar"]; - let initial_stack = vec![0xDEADBEEFu32.into(), 0.into()]; + let initial_stack = vec![0xDEADBEEFu32.into(), U256::from(Segment::RlpRaw as usize)]; let mut interpreter = Interpreter::new_with_kernel(decode_rlp_scalar, initial_stack); // The RLP encoding of "12 34 56". interpreter.set_rlp_memory(vec![0x83, 0x12, 0x34, 0x56]); interpreter.run()?; - let expected_stack = vec![0x123456.into(), 4.into()]; // scalar, pos + let expected_stack = vec![0x123456.into(), U256::from(Segment::RlpRaw as usize + 4)]; // scalar, pos assert_eq!(interpreter.stack(), expected_stack); Ok(()) diff --git a/evm/src/cpu/kernel/tests/rlp/encode.rs b/evm/src/cpu/kernel/tests/rlp/encode.rs index 2771dea0f9..d28a763fe8 100644 --- a/evm/src/cpu/kernel/tests/rlp/encode.rs +++ b/evm/src/cpu/kernel/tests/rlp/encode.rs @@ -1,7 +1,9 @@ use anyhow::Result; +use ethereum_types::U256; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::interpreter::Interpreter; +use crate::memory::segments::Segment; #[test] fn test_encode_rlp_scalar_small() -> Result<()> { @@ -9,12 +11,12 @@ fn test_encode_rlp_scalar_small() -> Result<()> { let retdest = 0xDEADBEEFu32.into(); let scalar = 42.into(); - let pos = 2.into(); + let pos = U256::from(Segment::RlpRaw as usize + 2); let initial_stack = vec![retdest, scalar, pos]; let mut interpreter = Interpreter::new_with_kernel(encode_rlp_scalar, initial_stack); interpreter.run()?; - let expected_stack = vec![3.into()]; // pos' = pos + rlp_len = 2 + 1 + let expected_stack = 
vec![pos + U256::from(1)]; // pos' = pos + rlp_len = 2 + 1 let expected_rlp = vec![0, 0, 42]; assert_eq!(interpreter.stack(), expected_stack); assert_eq!(interpreter.get_rlp_memory(), expected_rlp); @@ -28,12 +30,12 @@ fn test_encode_rlp_scalar_medium() -> Result<()> { let retdest = 0xDEADBEEFu32.into(); let scalar = 0x12345.into(); - let pos = 2.into(); + let pos = U256::from(Segment::RlpRaw as usize + 2); let initial_stack = vec![retdest, scalar, pos]; let mut interpreter = Interpreter::new_with_kernel(encode_rlp_scalar, initial_stack); interpreter.run()?; - let expected_stack = vec![6.into()]; // pos' = pos + rlp_len = 2 + 4 + let expected_stack = vec![pos + U256::from(4)]; // pos' = pos + rlp_len = 2 + 4 let expected_rlp = vec![0, 0, 0x80 + 3, 0x01, 0x23, 0x45]; assert_eq!(interpreter.stack(), expected_stack); assert_eq!(interpreter.get_rlp_memory(), expected_rlp); @@ -43,16 +45,16 @@ fn test_encode_rlp_scalar_medium() -> Result<()> { #[test] fn test_encode_rlp_160() -> Result<()> { - let encode_rlp_160 = KERNEL.global_labels["encode_rlp_160"]; + let encode_rlp_fixed = KERNEL.global_labels["encode_rlp_fixed"]; let retdest = 0xDEADBEEFu32.into(); let string = 0x12345.into(); - let pos = 0.into(); - let initial_stack = vec![retdest, string, pos]; - let mut interpreter = Interpreter::new_with_kernel(encode_rlp_160, initial_stack); + let pos = U256::from(Segment::RlpRaw as usize); + let initial_stack = vec![retdest, string, pos, U256::from(20)]; + let mut interpreter = Interpreter::new_with_kernel(encode_rlp_fixed, initial_stack); interpreter.run()?; - let expected_stack = vec![(1 + 20).into()]; // pos' + let expected_stack = vec![pos + U256::from(1 + 20)]; // pos' #[rustfmt::skip] let expected_rlp = vec![0x80 + 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x01, 0x23, 0x45]; assert_eq!(interpreter.stack(), expected_stack); @@ -63,16 +65,16 @@ fn test_encode_rlp_160() -> Result<()> { #[test] fn test_encode_rlp_256() -> Result<()> { - let encode_rlp_256 = 
KERNEL.global_labels["encode_rlp_256"]; + let encode_rlp_fixed = KERNEL.global_labels["encode_rlp_fixed"]; let retdest = 0xDEADBEEFu32.into(); let string = 0x12345.into(); - let pos = 0.into(); - let initial_stack = vec![retdest, string, pos]; - let mut interpreter = Interpreter::new_with_kernel(encode_rlp_256, initial_stack); + let pos = U256::from(Segment::RlpRaw as usize); + let initial_stack = vec![retdest, string, pos, U256::from(32)]; + let mut interpreter = Interpreter::new_with_kernel(encode_rlp_fixed, initial_stack); interpreter.run()?; - let expected_stack = vec![(1 + 32).into()]; // pos' + let expected_stack = vec![pos + U256::from(1 + 32)]; // pos' #[rustfmt::skip] let expected_rlp = vec![0x80 + 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x01, 0x23, 0x45]; assert_eq!(interpreter.stack(), expected_stack); @@ -86,8 +88,8 @@ fn test_prepend_rlp_list_prefix_small() -> Result<()> { let prepend_rlp_list_prefix = KERNEL.global_labels["prepend_rlp_list_prefix"]; let retdest = 0xDEADBEEFu32.into(); - let start_pos = 9.into(); - let end_pos = (9 + 5).into(); + let start_pos = U256::from(Segment::RlpRaw as usize + 9); + let end_pos = U256::from(Segment::RlpRaw as usize + 9 + 5); let initial_stack = vec![retdest, start_pos, end_pos]; let mut interpreter = Interpreter::new_with_kernel(prepend_rlp_list_prefix, initial_stack); interpreter.set_rlp_memory(vec![ @@ -100,7 +102,7 @@ fn test_prepend_rlp_list_prefix_small() -> Result<()> { interpreter.run()?; let expected_rlp_len = 6.into(); - let expected_start_pos = 8.into(); + let expected_start_pos = U256::from(Segment::RlpRaw as usize + 8); let expected_stack = vec![expected_rlp_len, expected_start_pos]; let expected_rlp = vec![0, 0, 0, 0, 0, 0, 0, 0, 0xc0 + 5, 1, 2, 3, 4, 5]; @@ -115,8 +117,8 @@ fn test_prepend_rlp_list_prefix_large() -> Result<()> { let prepend_rlp_list_prefix = KERNEL.global_labels["prepend_rlp_list_prefix"]; let retdest = 0xDEADBEEFu32.into(); - let 
start_pos = 9.into(); - let end_pos = (9 + 60).into(); + let start_pos = U256::from(Segment::RlpRaw as usize + 9); + let end_pos = U256::from(Segment::RlpRaw as usize + 9 + 60); let initial_stack = vec![retdest, start_pos, end_pos]; let mut interpreter = Interpreter::new_with_kernel(prepend_rlp_list_prefix, initial_stack); @@ -136,7 +138,7 @@ fn test_prepend_rlp_list_prefix_large() -> Result<()> { interpreter.run()?; let expected_rlp_len = 62.into(); - let expected_start_pos = 7.into(); + let expected_start_pos = U256::from(Segment::RlpRaw as usize + 7); let expected_stack = vec![expected_rlp_len, expected_start_pos]; #[rustfmt::skip] diff --git a/evm/src/cpu/kernel/tests/signed_syscalls.rs b/evm/src/cpu/kernel/tests/signed_syscalls.rs index 93391cf635..74b3524b00 100644 --- a/evm/src/cpu/kernel/tests/signed_syscalls.rs +++ b/evm/src/cpu/kernel/tests/signed_syscalls.rs @@ -120,7 +120,9 @@ fn run_test(fn_label: &str, expected_fn: fn(U256, U256) -> U256, opname: &str) { let mut interpreter = Interpreter::new_with_kernel(fn_label, stack); interpreter.run().unwrap(); assert_eq!(interpreter.stack_len(), 1usize, "unexpected stack size"); - let output = interpreter.stack_top(); + let output = interpreter + .stack_top() + .expect("The stack should not be empty."); let expected_output = expected_fn(x, y); assert_eq!( output, expected_output, diff --git a/evm/src/cpu/kernel/utils.rs b/evm/src/cpu/kernel/utils.rs index 3470904cdd..18b5f54822 100644 --- a/evm/src/cpu/kernel/utils.rs +++ b/evm/src/cpu/kernel/utils.rs @@ -1,4 +1,4 @@ -use std::fmt::Debug; +use core::fmt::Debug; use ethereum_types::U256; use plonky2_util::ceil_div_usize; @@ -31,7 +31,7 @@ pub(crate) fn u256_to_trimmed_be_bytes(u256: &U256) -> Vec { (0..num_bytes).rev().map(|i| u256.byte(i)).collect() } -pub(crate) fn u256_from_bool(b: bool) -> U256 { +pub(crate) const fn u256_from_bool(b: bool) -> U256 { if b { U256::one() } else { diff --git a/evm/src/cpu/membus.rs b/evm/src/cpu/membus.rs index 
10dc25a4ca..6ce845613d 100644 --- a/evm/src/cpu/membus.rs +++ b/evm/src/cpu/membus.rs @@ -7,13 +7,14 @@ use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer use crate::cpu::columns::CpuColumnsView; /// General-purpose memory channels; they can read and write to all contexts/segments/addresses. -pub const NUM_GP_CHANNELS: usize = 5; +pub(crate) const NUM_GP_CHANNELS: usize = 3; +/// Indices for code and general purpose memory channels. pub mod channel_indices { - use std::ops::Range; + use core::ops::Range; - pub const CODE: usize = 0; - pub const GP: Range = CODE + 1..(CODE + 1) + super::NUM_GP_CHANNELS; + pub(crate) const CODE: usize = 0; + pub(crate) const GP: Range = CODE + 1..(CODE + 1) + super::NUM_GP_CHANNELS; } /// Total memory channels used by the CPU table. This includes all the `GP_MEM_CHANNELS` as well as @@ -28,34 +29,39 @@ pub mod channel_indices { /// - the address is `program_counter`, /// - the value must fit in one byte (in the least-significant position) and its eight bits are /// found in `opcode_bits`. +/// +/// There is also a partial channel, which shares its values with another general purpose channel. +/// /// These limitations save us numerous columns in the CPU table. -pub const NUM_CHANNELS: usize = channel_indices::GP.end; +pub(crate) const NUM_CHANNELS: usize = channel_indices::GP.end + 1; -pub fn eval_packed( +/// Evaluates constraints regarding the membus. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { // Validate `lv.code_context`. // It should be 0 if in kernel mode and `lv.context` if in user mode. - // Note: This doesn't need to be filtered to CPU cycles, as this should also be satisfied - // during Kernel bootstrapping. yield_constr.constraint(lv.code_context - (P::ONES - lv.is_kernel_mode) * lv.context); // Validate `channel.used`. It should be binary. for channel in lv.mem_channels { yield_constr.constraint(channel.used * (channel.used - P::ONES)); } + + // Validate `partial_channel.used`. It should be binary. + yield_constr.constraint(lv.partial_channel.used * (lv.partial_channel.used - P::ONES)); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints regarding the membus. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { // Validate `lv.code_context`. // It should be 0 if in kernel mode and `lv.context` if in user mode. - // Note: This doesn't need to be filtered to CPU cycles, as this should also be satisfied - // during Kernel bootstrapping. let diff = builder.sub_extension(lv.context, lv.code_context); let constr = builder.mul_sub_extension(lv.is_kernel_mode, lv.context, diff); yield_constr.constraint(builder, constr); @@ -65,4 +71,14 @@ pub fn eval_ext_circuit, const D: usize>( let constr = builder.mul_sub_extension(channel.used, channel.used, channel.used); yield_constr.constraint(builder, constr); } + + // Validate `partial_channel.used`. It should be binary. 
+ { + let constr = builder.mul_sub_extension( + lv.partial_channel.used, + lv.partial_channel.used, + lv.partial_channel.used, + ); + yield_constr.constraint(builder, constr); + } } diff --git a/evm/src/cpu/memio.rs b/evm/src/cpu/memio.rs index f70f3fdb67..924f030f5f 100644 --- a/evm/src/cpu/memio.rs +++ b/evm/src/cpu/memio.rs @@ -5,40 +5,52 @@ use plonky2::field::types::Field; use plonky2::hash::hash_types::RichField; use plonky2::iop::ext_target::ExtensionTarget; +use super::cpu_stark::get_addr; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::columns::CpuColumnsView; -use crate::cpu::membus::NUM_GP_CHANNELS; use crate::cpu::stack; use crate::memory::segments::Segment; -fn get_addr(lv: &CpuColumnsView) -> (T, T, T) { - let addr_context = lv.mem_channels[0].value[0]; - let addr_segment = lv.mem_channels[1].value[0]; - let addr_virtual = lv.mem_channels[2].value[0]; - (addr_context, addr_segment, addr_virtual) +const fn get_addr_load(lv: &CpuColumnsView) -> (T, T, T) { + get_addr(lv, 0) +} +const fn get_addr_store(lv: &CpuColumnsView) -> (T, T, T) { + get_addr(lv, 1) } +/// Evaluates constraints for MLOAD_GENERAL. fn eval_packed_load( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { - // The opcode for MLOAD_GENERAL is 0xfb. If the operation is MLOAD_GENERAL, lv.opcode_bits[0] = 1 + // The opcode for MLOAD_GENERAL is 0xfb. If the operation is MLOAD_GENERAL, lv.opcode_bits[0] = 1. let filter = lv.op.m_op_general * lv.opcode_bits[0]; - let (addr_context, addr_segment, addr_virtual) = get_addr(lv); + let (addr_context, addr_segment, addr_virtual) = get_addr_load(lv); - let load_channel = lv.mem_channels[3]; + // Check that we are loading the correct value from the correct address. + let load_channel = lv.mem_channels[1]; yield_constr.constraint(filter * (load_channel.used - P::ONES)); yield_constr.constraint(filter * (load_channel.is_read - P::ONES)); yield_constr.constraint(filter * (load_channel.addr_context - addr_context)); yield_constr.constraint(filter * (load_channel.addr_segment - addr_segment)); yield_constr.constraint(filter * (load_channel.addr_virtual - addr_virtual)); + // Constrain the new top of the stack. + for (&limb_loaded, &limb_new_top) in load_channel + .value + .iter() + .zip(nv.mem_channels[0].value.iter()) + { + yield_constr.constraint(filter * (limb_loaded - limb_new_top)); + } + // Disable remaining memory channels, if any. - for &channel in &lv.mem_channels[4..NUM_GP_CHANNELS] { + for &channel in &lv.mem_channels[2..] { yield_constr.constraint(filter * channel.used); } + yield_constr.constraint(filter * lv.partial_channel.used); // Stack constraints stack::eval_packed_one( @@ -50,18 +62,22 @@ fn eval_packed_load( ); } +/// Circuit version for `eval_packed_load`. +/// Evaluates constraints for MLOAD_GENERAL. fn eval_ext_circuit_load, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { + // The opcode for MLOAD_GENERAL is 0xfb. If the operation is MLOAD_GENERAL, lv.opcode_bits[0] = 1. 
let mut filter = lv.op.m_op_general; filter = builder.mul_extension(filter, lv.opcode_bits[0]); - let (addr_context, addr_segment, addr_virtual) = get_addr(lv); + let (addr_context, addr_segment, addr_virtual) = get_addr_load(lv); - let load_channel = lv.mem_channels[3]; + // Check that we are loading the correct value from the correct channel. + let load_channel = lv.mem_channels[1]; { let constr = builder.mul_sub_extension(filter, load_channel.used, filter); yield_constr.constraint(builder, constr); @@ -83,11 +99,26 @@ fn eval_ext_circuit_load, const D: usize>( yield_constr.constraint(builder, constr); } + // Constrain the new top of the stack. + for (&limb_loaded, &limb_new_top) in load_channel + .value + .iter() + .zip(nv.mem_channels[0].value.iter()) + { + let diff = builder.sub_extension(limb_loaded, limb_new_top); + let constr = builder.mul_extension(filter, diff); + yield_constr.constraint(builder, constr); + } + // Disable remaining memory channels, if any. - for &channel in &lv.mem_channels[4..NUM_GP_CHANNELS] { + for &channel in &lv.mem_channels[2..] { let constr = builder.mul_extension(filter, channel.used); yield_constr.constraint(builder, constr); } + { + let constr = builder.mul_extension(filter, lv.partial_channel.used); + yield_constr.constraint(builder, constr); + } // Stack constraints stack::eval_ext_circuit_one( @@ -100,6 +131,7 @@ fn eval_ext_circuit_load, const D: usize>( ); } +/// Evaluates constraints for MSTORE_GENERAL. fn eval_packed_store( lv: &CpuColumnsView

, nv: &CpuColumnsView

, @@ -107,27 +139,25 @@ fn eval_packed_store( ) { let filter = lv.op.m_op_general * (lv.opcode_bits[0] - P::ONES); - let (addr_context, addr_segment, addr_virtual) = get_addr(lv); + let (addr_context, addr_segment, addr_virtual) = get_addr_store(lv); + + // The value will be checked with the CTL. + let store_channel = lv.partial_channel; - let value_channel = lv.mem_channels[3]; - let store_channel = lv.mem_channels[4]; yield_constr.constraint(filter * (store_channel.used - P::ONES)); yield_constr.constraint(filter * store_channel.is_read); yield_constr.constraint(filter * (store_channel.addr_context - addr_context)); yield_constr.constraint(filter * (store_channel.addr_segment - addr_segment)); yield_constr.constraint(filter * (store_channel.addr_virtual - addr_virtual)); - for (value_limb, store_limb) in izip!(value_channel.value, store_channel.value) { - yield_constr.constraint(filter * (value_limb - store_limb)); - } // Disable remaining memory channels, if any. - for &channel in &lv.mem_channels[5..] { + for &channel in &lv.mem_channels[2..] { yield_constr.constraint(filter * channel.used); } // Stack constraints. // Pops. - for i in 1..4 { + for i in 1..2 { let channel = lv.mem_channels[i]; yield_constr.constraint(filter * (channel.used - P::ONES)); @@ -135,19 +165,21 @@ fn eval_packed_store( yield_constr.constraint(filter * (channel.addr_context - lv.context)); yield_constr.constraint( - filter * (channel.addr_segment - P::Scalar::from_canonical_u64(Segment::Stack as u64)), + filter + * (channel.addr_segment + - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), ); // Remember that the first read (`i == 1`) is for the second stack element at `stack[stack_len - 1]`. let addr_virtual = lv.stack_len - P::Scalar::from_canonical_usize(i + 1); yield_constr.constraint(filter * (channel.addr_virtual - addr_virtual)); } // Constrain `stack_inv_aux`. 
- let len_diff = lv.stack_len - P::Scalar::from_canonical_usize(4); + let len_diff = lv.stack_len - P::Scalar::from_canonical_usize(2); yield_constr.constraint( lv.op.m_op_general * (len_diff * lv.general.stack().stack_inv - lv.general.stack().stack_inv_aux), ); - // If stack_len != 4 and MSTORE, read new top of the stack in nv.mem_channels[0]. + // If stack_len != 2 and MSTORE, read new top of the stack in nv.mem_channels[0]. let top_read_channel = nv.mem_channels[0]; let is_top_read = lv.general.stack().stack_inv_aux * (P::ONES - lv.opcode_bits[0]); // Constrain `stack_inv_aux_2`. It contains `stack_inv_aux * opcode_bits[0]`. @@ -160,17 +192,19 @@ fn eval_packed_store( yield_constr.constraint_transition( new_filter * (top_read_channel.addr_segment - - P::Scalar::from_canonical_u64(Segment::Stack as u64)), + - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), ); let addr_virtual = nv.stack_len - P::ONES; yield_constr.constraint_transition(new_filter * (top_read_channel.addr_virtual - addr_virtual)); - // If stack_len == 4 or MLOAD, disable the channel. + // If stack_len == 2 or MLOAD, disable the channel. yield_constr.constraint( lv.op.m_op_general * (lv.general.stack().stack_inv_aux - P::ONES) * top_read_channel.used, ); yield_constr.constraint(lv.op.m_op_general * lv.opcode_bits[0] * top_read_channel.used); } +/// Circuit version of `eval_packed_store`. +/// Evaluates constraints for MSTORE_GENERAL. fn eval_ext_circuit_store, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, @@ -180,10 +214,10 @@ fn eval_ext_circuit_store, const D: usize>( let filter = builder.mul_sub_extension(lv.op.m_op_general, lv.opcode_bits[0], lv.op.m_op_general); - let (addr_context, addr_segment, addr_virtual) = get_addr(lv); + let (addr_context, addr_segment, addr_virtual) = get_addr_store(lv); - let value_channel = lv.mem_channels[3]; - let store_channel = lv.mem_channels[4]; + // The value will be checked with the CTL. 
+ let store_channel = lv.partial_channel; { let constr = builder.mul_sub_extension(filter, store_channel.used, filter); yield_constr.constraint(builder, constr); @@ -204,21 +238,16 @@ fn eval_ext_circuit_store, const D: usize>( let constr = builder.mul_extension(filter, diff); yield_constr.constraint(builder, constr); } - for (value_limb, store_limb) in izip!(value_channel.value, store_channel.value) { - let diff = builder.sub_extension(value_limb, store_limb); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint(builder, constr); - } // Disable remaining memory channels, if any. - for &channel in &lv.mem_channels[5..] { + for &channel in &lv.mem_channels[2..] { let constr = builder.mul_extension(filter, channel.used); yield_constr.constraint(builder, constr); } // Stack constraints // Pops. - for i in 1..4 { + for i in 1..2 { let channel = lv.mem_channels[i]; { @@ -237,7 +266,7 @@ fn eval_ext_circuit_store, const D: usize>( { let diff = builder.add_const_extension( channel.addr_segment, - -F::from_canonical_u64(Segment::Stack as u64), + -F::from_canonical_usize(Segment::Stack.unscale()), ); let constr = builder.mul_extension(filter, diff); yield_constr.constraint(builder, constr); @@ -251,7 +280,7 @@ fn eval_ext_circuit_store, const D: usize>( } // Constrain `stack_inv_aux`. { - let len_diff = builder.add_const_extension(lv.stack_len, -F::from_canonical_usize(4)); + let len_diff = builder.add_const_extension(lv.stack_len, -F::from_canonical_usize(2)); let diff = builder.mul_sub_extension( len_diff, lv.general.stack().stack_inv, @@ -260,11 +289,11 @@ fn eval_ext_circuit_store, const D: usize>( let constr = builder.mul_extension(lv.op.m_op_general, diff); yield_constr.constraint(builder, constr); } - // If stack_len != 4 and MSTORE, read new top of the stack in nv.mem_channels[0]. + // If stack_len != 2 and MSTORE, read new top of the stack in nv.mem_channels[0]. 
let top_read_channel = nv.mem_channels[0]; let is_top_read = builder.mul_extension(lv.general.stack().stack_inv_aux, lv.opcode_bits[0]); let is_top_read = builder.sub_extension(lv.general.stack().stack_inv_aux, is_top_read); - // Constrain `stack_inv_aux_2`. It contains `stack_inv_aux * opcode_bits[0]`. + // Constrain `stack_inv_aux_2`. It contains `stack_inv_aux * (1 - opcode_bits[0])`. { let diff = builder.sub_extension(lv.general.stack().stack_inv_aux_2, is_top_read); let constr = builder.mul_extension(lv.op.m_op_general, diff); @@ -287,7 +316,7 @@ fn eval_ext_circuit_store, const D: usize>( { let diff = builder.add_const_extension( top_read_channel.addr_segment, - -F::from_canonical_u64(Segment::Stack as u64), + -F::from_canonical_usize(Segment::Stack.unscale()), ); let constr = builder.mul_extension(new_filter, diff); yield_constr.constraint_transition(builder, constr); @@ -298,7 +327,7 @@ fn eval_ext_circuit_store, const D: usize>( let constr = builder.mul_extension(new_filter, diff); yield_constr.constraint_transition(builder, constr); } - // If stack_len == 4 or MLOAD, disable the channel. + // If stack_len == 2 or MLOAD, disable the channel. { let diff = builder.mul_sub_extension( lv.op.m_op_general, @@ -315,7 +344,8 @@ fn eval_ext_circuit_store, const D: usize>( } } -pub fn eval_packed( +/// Evaluates constraints for MLOAD_GENERAL and MSTORE_GENERAL. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -324,7 +354,9 @@ pub fn eval_packed( eval_packed_store(lv, nv, yield_constr); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints for MLOAD_GENERAL and MSTORE_GENERAL. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, diff --git a/evm/src/cpu/mod.rs b/evm/src/cpu/mod.rs index 0885f644bb..3d5124ba2b 100644 --- a/evm/src/cpu/mod.rs +++ b/evm/src/cpu/mod.rs @@ -1,4 +1,5 @@ -pub(crate) mod bootstrap_kernel; +mod byte_unpacking; +mod clock; pub(crate) mod columns; mod contextops; pub(crate) mod control_flow; @@ -17,5 +18,4 @@ mod push0; mod shift; pub(crate) mod simple_logic; pub(crate) mod stack; -pub(crate) mod stack_bounds; mod syscalls_exceptions; diff --git a/evm/src/cpu/modfp254.rs b/evm/src/cpu/modfp254.rs index eed497f5d3..95bab8d655 100644 --- a/evm/src/cpu/modfp254.rs +++ b/evm/src/cpu/modfp254.rs @@ -15,7 +15,8 @@ const P_LIMBS: [u32; 8] = [ 0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, ]; -pub fn eval_packed( +/// Evaluates constraints to check the modulus in mem_channel[2]. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { @@ -31,7 +32,9 @@ pub fn eval_packed( } } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints to check the modulus in mem_channel[2]. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, diff --git a/evm/src/cpu/pc.rs b/evm/src/cpu/pc.rs index 5271ad81aa..9635534e50 100644 --- a/evm/src/cpu/pc.rs +++ b/evm/src/cpu/pc.rs @@ -6,12 +6,14 @@ use plonky2::iop::ext_target::ExtensionTarget; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::columns::CpuColumnsView; -pub fn eval_packed( +/// Evaluates constraints to check that we are storing the correct PC. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { - let filter = lv.op.pc; + // `PUSH0`'s opcode is odd, while `PC`'s opcode is even. + let filter = lv.op.pc_push0 * (P::ONES - lv.opcode_bits[0]); let new_stack_top = nv.mem_channels[0].value; yield_constr.constraint(filter * (new_stack_top[0] - lv.program_counter)); for &limb in &new_stack_top[1..] { @@ -19,13 +21,18 @@ pub fn eval_packed( } } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version if `eval_packed`. +/// Evaluates constraints to check that we are storing the correct PC. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { - let filter = lv.op.pc; + // `PUSH0`'s opcode is odd, while `PC`'s opcode is even. + let one = builder.one_extension(); + let mut filter = builder.sub_extension(one, lv.opcode_bits[0]); + filter = builder.mul_extension(lv.op.pc_push0, filter); let new_stack_top = nv.mem_channels[0].value; { let diff = builder.sub_extension(new_stack_top[0], lv.program_counter); diff --git a/evm/src/cpu/push0.rs b/evm/src/cpu/push0.rs index d49446cc23..ed9f6c10f2 100644 --- a/evm/src/cpu/push0.rs +++ b/evm/src/cpu/push0.rs @@ -6,24 +6,29 @@ use plonky2::iop::ext_target::ExtensionTarget; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::columns::CpuColumnsView; -pub fn eval_packed( +/// Evaluates constraints to check that we are not pushing anything. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { - let filter = lv.op.push0; + // `PUSH0`'s opcode is odd, while `PC`'s opcode is even. + let filter = lv.op.pc_push0 * lv.opcode_bits[0]; for limb in nv.mem_channels[0].value { yield_constr.constraint(filter * limb); } } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints to check that we are not pushing anything. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { - let filter = lv.op.push0; + // `PUSH0`'s opcode is odd, while `PC`'s opcode is even. + let filter = builder.mul_extension(lv.op.pc_push0, lv.opcode_bits[0]); for limb in nv.mem_channels[0].value { let constr = builder.mul_extension(filter, limb); yield_constr.constraint(builder, constr); diff --git a/evm/src/cpu/shift.rs b/evm/src/cpu/shift.rs index 0f92cbd20d..9e751421ff 100644 --- a/evm/src/cpu/shift.rs +++ b/evm/src/cpu/shift.rs @@ -9,6 +9,8 @@ use crate::cpu::columns::CpuColumnsView; use crate::cpu::membus::NUM_GP_CHANNELS; use crate::memory::segments::Segment; +/// Evaluates constraints for shift operations on the CPU side: +/// the shifting factor is read from memory when displacement < 2^32. pub(crate) fn eval_packed( lv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -22,7 +24,7 @@ pub(crate) fn eval_packed( // let val = lv.mem_channels[0]; // let output = lv.mem_channels[NUM_GP_CHANNELS - 1]; - let shift_table_segment = P::Scalar::from_canonical_u64(Segment::ShiftTable as u64); + let shift_table_segment = P::Scalar::from_canonical_usize(Segment::ShiftTable.unscale()); // Only lookup the shifting factor when displacement is < 2^32. // two_exp.used is true (1) if the high limbs of the displacement are @@ -46,7 +48,7 @@ pub(crate) fn eval_packed( yield_constr.constraint(is_shift * (two_exp.addr_virtual - displacement.value[0])); // Other channels must be unused - for chan in &lv.mem_channels[3..NUM_GP_CHANNELS - 1] { + for chan in &lv.mem_channels[3..NUM_GP_CHANNELS] { yield_constr.constraint(is_shift * chan.used); // channel is not used } @@ -56,9 +58,12 @@ pub(crate) fn eval_packed( // // 1 -> 0 (value to be shifted is the same) // 2 -> 1 (two_exp becomes the multiplicand (resp. divisor)) - // last -> last (output is the same) + // next_0 -> next_0 (output is the same) } +/// Circuit version. +/// Evaluates constraints for shift operations on the CPU side: +/// the shifting factor is read from memory when displacement < 2^32. pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, @@ -68,7 +73,7 @@ pub(crate) fn eval_ext_circuit, const D: usize>( let displacement = lv.mem_channels[0]; let two_exp = lv.mem_channels[2]; - let shift_table_segment = F::from_canonical_u64(Segment::ShiftTable as u64); + let shift_table_segment = F::from_canonical_usize(Segment::ShiftTable.unscale()); // Only lookup the shifting factor when displacement is < 2^32. 
// two_exp.used is true (1) if the high limbs of the displacement are @@ -111,7 +116,7 @@ pub(crate) fn eval_ext_circuit, const D: usize>( yield_constr.constraint(builder, t); // Other channels must be unused - for chan in &lv.mem_channels[3..NUM_GP_CHANNELS - 1] { + for chan in &lv.mem_channels[3..NUM_GP_CHANNELS] { let t = builder.mul_extension(is_shift, chan.used); yield_constr.constraint(builder, t); } diff --git a/evm/src/cpu/simple_logic/eq_iszero.rs b/evm/src/cpu/simple_logic/eq_iszero.rs index 7be021caa6..fd811ae7f7 100644 --- a/evm/src/cpu/simple_logic/eq_iszero.rs +++ b/evm/src/cpu/simple_logic/eq_iszero.rs @@ -19,27 +19,20 @@ fn limbs(x: U256) -> [u32; 8] { } res } - -pub fn generate_pinv_diff(val0: U256, val1: U256, lv: &mut CpuColumnsView) { +/// Form `diff_pinv`. +/// Let `diff = val0 - val1`. Consider `x[i] = diff[i]^-1` if `diff[i] != 0` and 0 otherwise. +/// Then `diff @ x = num_unequal_limbs`, where `@` denotes the dot product. We set +/// `diff_pinv = num_unequal_limbs^-1 * x` if `num_unequal_limbs != 0` and 0 otherwise. We have +/// `diff @ diff_pinv = 1 - equal` as desired. +pub(crate) fn generate_pinv_diff(val0: U256, val1: U256, lv: &mut CpuColumnsView) { let val0_limbs = limbs(val0).map(F::from_canonical_u32); let val1_limbs = limbs(val1).map(F::from_canonical_u32); let num_unequal_limbs = izip!(val0_limbs, val1_limbs) .map(|(limb0, limb1)| (limb0 != limb1) as usize) .sum(); - let equal = num_unequal_limbs == 0; - - let output = &mut lv.mem_channels[2].value; - output[0] = F::from_bool(equal); - for limb in &mut output[1..] { - *limb = F::ZERO; - } // Form `diff_pinv`. - // Let `diff = val0 - val1`. Consider `x[i] = diff[i]^-1` if `diff[i] != 0` and 0 otherwise. - // Then `diff @ x = num_unequal_limbs`, where `@` denotes the dot product. We set - // `diff_pinv = num_unequal_limbs^-1 * x` if `num_unequal_limbs != 0` and 0 otherwise. We have - // `diff @ diff_pinv = 1 - equal` as desired. 
let logic = lv.general.logic_mut(); let num_unequal_limbs_inv = F::from_canonical_usize(num_unequal_limbs) .try_inverse() @@ -49,7 +42,8 @@ pub fn generate_pinv_diff(val0: U256, val1: U256, lv: &mut CpuColumnsV } } -pub fn eval_packed( +/// Evaluates the constraints for EQ and ISZERO. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -57,7 +51,7 @@ pub fn eval_packed( let logic = lv.general.logic(); let input0 = lv.mem_channels[0].value; let input1 = lv.mem_channels[1].value; - let output = lv.mem_channels[2].value; + let output = nv.mem_channels[0].value; // EQ (0x14) and ISZERO (0x15) are differentiated by their first opcode bit. let eq_filter = lv.op.eq_iszero * (P::ONES - lv.opcode_bits[0]); @@ -105,7 +99,9 @@ pub fn eval_packed( ); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates the constraints for EQ and ISZERO. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, @@ -117,7 +113,7 @@ pub fn eval_ext_circuit, const D: usize>( let logic = lv.general.logic(); let input0 = lv.mem_channels[0].value; let input1 = lv.mem_channels[1].value; - let output = lv.mem_channels[2].value; + let output = nv.mem_channels[0].value; // EQ (0x14) and ISZERO (0x15) are differentiated by their first opcode bit. let eq_filter = builder.mul_extension(lv.op.eq_iszero, lv.opcode_bits[0]); diff --git a/evm/src/cpu/simple_logic/mod.rs b/evm/src/cpu/simple_logic/mod.rs index 9b4e60b016..04f8bcc2da 100644 --- a/evm/src/cpu/simple_logic/mod.rs +++ b/evm/src/cpu/simple_logic/mod.rs @@ -9,21 +9,24 @@ use plonky2::iop::ext_target::ExtensionTarget; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::columns::CpuColumnsView; -pub fn eval_packed( +/// Evaluates constraints for NOT, EQ and ISZERO. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { - not::eval_packed(lv, yield_constr); + not::eval_packed(lv, nv, yield_constr); eq_iszero::eval_packed(lv, nv, yield_constr); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints for NOT, EQ and ISZERO. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { - not::eval_ext_circuit(builder, lv, yield_constr); + not::eval_ext_circuit(builder, lv, nv, yield_constr); eq_iszero::eval_ext_circuit(builder, lv, nv, yield_constr); } diff --git a/evm/src/cpu/simple_logic/not.rs b/evm/src/cpu/simple_logic/not.rs index 0bfaa0b71a..3798606de3 100644 --- a/evm/src/cpu/simple_logic/not.rs +++ b/evm/src/cpu/simple_logic/not.rs @@ -6,34 +6,42 @@ use plonky2::iop::ext_target::ExtensionTarget; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::columns::CpuColumnsView; -use crate::cpu::membus::NUM_GP_CHANNELS; +use crate::cpu::stack; const LIMB_SIZE: usize = 32; const ALL_1_LIMB: u64 = (1 << LIMB_SIZE) - 1; -pub fn eval_packed( +/// Evaluates constraints for NOT. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, + nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { // This is simple: just do output = 0xffffffff - input. let input = lv.mem_channels[0].value; - let output = lv.mem_channels[NUM_GP_CHANNELS - 1].value; - let filter = lv.op.not; + let output = nv.mem_channels[0].value; + let filter = lv.op.not_pop * lv.opcode_bits[0]; for (input_limb, output_limb) in input.into_iter().zip(output) { yield_constr.constraint( filter * (output_limb + input_limb - P::Scalar::from_canonical_u64(ALL_1_LIMB)), ); } + + // Stack constraints. + stack::eval_packed_one(lv, nv, filter, stack::BASIC_UNARY_OP.unwrap(), yield_constr); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints for NOT. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, + nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { let input = lv.mem_channels[0].value; - let output = lv.mem_channels[NUM_GP_CHANNELS - 1].value; - let filter = lv.op.not; + let output = nv.mem_channels[0].value; + let filter = builder.mul_extension(lv.op.not_pop, lv.opcode_bits[0]); for (input_limb, output_limb) in input.into_iter().zip(output) { let constr = builder.add_extension(output_limb, input_limb); let constr = builder.arithmetic_extension( @@ -45,4 +53,14 @@ pub fn eval_ext_circuit, const D: usize>( ); yield_constr.constraint(builder, constr); } + + // Stack constraints. 
+ stack::eval_ext_circuit_one( + builder, + lv, + nv, + filter, + stack::BASIC_UNARY_OP.unwrap(), + yield_constr, + ); } diff --git a/evm/src/cpu/stack.rs b/evm/src/cpu/stack.rs index db0c480d3d..87ca7ee1c4 100644 --- a/evm/src/cpu/stack.rs +++ b/evm/src/cpu/stack.rs @@ -1,4 +1,4 @@ -use std::cmp::max; +use core::cmp::max; use itertools::izip; use plonky2::field::extension::Extendable; @@ -13,49 +13,96 @@ use crate::cpu::columns::CpuColumnsView; use crate::cpu::membus::NUM_GP_CHANNELS; use crate::memory::segments::Segment; +pub(crate) const MAX_USER_STACK_SIZE: usize = 1024; + +// We check for stack overflows here. An overflow occurs when the stack length is 1025 in user mode, +// which can happen after a non-kernel-only, non-popping, pushing instruction/syscall. +// The check uses `stack_len_bounds_aux`, which is either 0 if next row's `stack_len` is 1025 or +// next row is in kernel mode, or the inverse of `nv.stack_len - 1025` otherwise. +pub(crate) const MIGHT_OVERFLOW: OpsColumnsView = OpsColumnsView { + binary_op: false, + ternary_op: false, + fp254_op: false, + eq_iszero: false, + logic_op: false, + not_pop: false, + shift: false, + jumpdest_keccak_general: false, + push_prover_input: true, // PROVER_INPUT doesn't require the check, but PUSH does. + jumps: false, + pc_push0: true, + dup_swap: true, + context_op: false, + m_op_32bytes: false, + exit_kernel: true, // Doesn't directly push, but the syscall it's returning from might. + m_op_general: false, + syscall: false, + exception: false, +}; + +/// Structure to represent opcodes stack behaviours: +/// - number of pops +/// - whether the opcode(s) push +/// - whether unused channels should be disabled. #[derive(Clone, Copy)] pub(crate) struct StackBehavior { pub(crate) num_pops: usize, pub(crate) pushes: bool, - new_top_stack_channel: Option, disable_other_channels: bool, } +/// `StackBehavior` for unary operations. 
+pub(crate) const BASIC_UNARY_OP: Option = Some(StackBehavior { + num_pops: 1, + pushes: true, + disable_other_channels: true, +}); +/// `StackBehavior` for binary operations. const BASIC_BINARY_OP: Option = Some(StackBehavior { num_pops: 2, pushes: true, - new_top_stack_channel: Some(NUM_GP_CHANNELS - 1), disable_other_channels: true, }); +/// `StackBehavior` for ternary operations. const BASIC_TERNARY_OP: Option = Some(StackBehavior { num_pops: 3, pushes: true, - new_top_stack_channel: Some(NUM_GP_CHANNELS - 1), disable_other_channels: true, }); +/// `StackBehavior` for JUMP. pub(crate) const JUMP_OP: Option = Some(StackBehavior { num_pops: 1, pushes: false, - new_top_stack_channel: None, disable_other_channels: false, }); +/// `StackBehavior` for JUMPI. pub(crate) const JUMPI_OP: Option = Some(StackBehavior { num_pops: 2, pushes: false, - new_top_stack_channel: None, disable_other_channels: false, }); - +/// `StackBehavior` for MLOAD_GENERAL. pub(crate) const MLOAD_GENERAL_OP: Option = Some(StackBehavior { - num_pops: 3, + num_pops: 1, pushes: true, - new_top_stack_channel: None, disable_other_channels: false, }); +pub(crate) const KECCAK_GENERAL_OP: StackBehavior = StackBehavior { + num_pops: 2, + pushes: true, + disable_other_channels: true, +}; + +pub(crate) const JUMPDEST_OP: StackBehavior = StackBehavior { + num_pops: 0, + pushes: false, + disable_other_channels: true, +}; + // AUDITORS: If the value below is `None`, then the operation must be manually checked to ensure // that every general-purpose memory channel is either disabled or has its read flag and address -// propertly constrained. The same applies when `disable_other_channels` is set to `false`, +// properly constrained. The same applies when `disable_other_channels` is set to `false`, // except the first `num_pops` and the last `pushes as usize` channels have their read flag and // address constrained automatically in this file. 
pub(crate) const STACK_BEHAVIORS: OpsColumnsView> = OpsColumnsView { @@ -64,105 +111,63 @@ pub(crate) const STACK_BEHAVIORS: OpsColumnsView> = OpsCol fp254_op: BASIC_BINARY_OP, eq_iszero: None, // EQ is binary, IS_ZERO is unary. logic_op: BASIC_BINARY_OP, - not: Some(StackBehavior { - num_pops: 1, - pushes: true, - new_top_stack_channel: Some(NUM_GP_CHANNELS - 1), - disable_other_channels: true, - }), + not_pop: None, shift: Some(StackBehavior { num_pops: 2, pushes: true, - new_top_stack_channel: Some(NUM_GP_CHANNELS - 1), disable_other_channels: false, }), - keccak_general: Some(StackBehavior { - num_pops: 4, - pushes: true, - new_top_stack_channel: Some(NUM_GP_CHANNELS - 1), - disable_other_channels: true, - }), - prover_input: None, // TODO - pop: Some(StackBehavior { - num_pops: 1, - pushes: false, - new_top_stack_channel: None, - disable_other_channels: true, - }), - jumps: None, // Depends on whether it's a JUMP or a JUMPI. - pc: Some(StackBehavior { + jumpdest_keccak_general: None, + push_prover_input: Some(StackBehavior { num_pops: 0, pushes: true, - new_top_stack_channel: None, - disable_other_channels: true, - }), - jumpdest: Some(StackBehavior { - num_pops: 0, - pushes: false, - new_top_stack_channel: None, disable_other_channels: true, }), - push0: Some(StackBehavior { + jumps: None, // Depends on whether it's a JUMP or a JUMPI. + pc_push0: Some(StackBehavior { num_pops: 0, pushes: true, - new_top_stack_channel: None, disable_other_channels: true, }), - push: None, // TODO dup_swap: None, - get_context: Some(StackBehavior { - num_pops: 0, - pushes: true, - new_top_stack_channel: None, - disable_other_channels: true, - }), - set_context: None, // SET_CONTEXT is special since it involves the old and the new stack. 
- mload_32bytes: Some(StackBehavior { - num_pops: 4, + context_op: None, + m_op_32bytes: Some(StackBehavior { + num_pops: 2, pushes: true, - new_top_stack_channel: Some(4), - disable_other_channels: false, - }), - mstore_32bytes: Some(StackBehavior { - num_pops: 5, - pushes: false, - new_top_stack_channel: None, disable_other_channels: false, }), exit_kernel: Some(StackBehavior { num_pops: 1, pushes: false, - new_top_stack_channel: None, disable_other_channels: true, }), m_op_general: None, syscall: Some(StackBehavior { num_pops: 0, pushes: true, - new_top_stack_channel: None, disable_other_channels: false, }), exception: Some(StackBehavior { num_pops: 0, pushes: true, - new_top_stack_channel: None, disable_other_channels: false, }), }; +/// Stack behavior for EQ. pub(crate) const EQ_STACK_BEHAVIOR: Option = Some(StackBehavior { num_pops: 2, pushes: true, - new_top_stack_channel: Some(2), disable_other_channels: true, }); +/// Stack behavior for ISZERO. pub(crate) const IS_ZERO_STACK_BEHAVIOR: Option = Some(StackBehavior { num_pops: 1, pushes: true, - new_top_stack_channel: Some(2), disable_other_channels: true, }); +/// Evaluates constraints for one `StackBehavior`. pub(crate) fn eval_packed_one( lv: &CpuColumnsView

, nv: &CpuColumnsView

, @@ -181,13 +186,17 @@ pub(crate) fn eval_packed_one( yield_constr.constraint(filter * (channel.addr_context - lv.context)); yield_constr.constraint( filter - * (channel.addr_segment - P::Scalar::from_canonical_u64(Segment::Stack as u64)), + * (channel.addr_segment + - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), ); // Remember that the first read (`i == 1`) is for the second stack element at `stack[stack_len - 1]`. let addr_virtual = lv.stack_len - P::Scalar::from_canonical_usize(i + 1); yield_constr.constraint(filter * (channel.addr_virtual - addr_virtual)); } + // You can't have a write of the top of the stack, so you disable the corresponding flag. + yield_constr.constraint(filter * lv.partial_channel.used); + // If you also push, you don't need to read the new top of the stack. // If you don't: // - if the stack isn't empty after the pops, you read the new top from an extra pop. @@ -204,7 +213,8 @@ pub(crate) fn eval_packed_one( yield_constr.constraint_transition(new_filter * (channel.addr_context - nv.context)); yield_constr.constraint_transition( new_filter - * (channel.addr_segment - P::Scalar::from_canonical_u64(Segment::Stack as u64)), + * (channel.addr_segment + - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), ); let addr_virtual = nv.stack_len - P::ONES; yield_constr.constraint_transition(new_filter * (channel.addr_virtual - addr_virtual)); @@ -222,20 +232,19 @@ pub(crate) fn eval_packed_one( else if stack_behavior.pushes { // If len > 0... let new_filter = lv.stack_len * filter; - // You write the previous top of the stack in memory, in the last channel. - let channel = lv.mem_channels[NUM_GP_CHANNELS - 1]; + // You write the previous top of the stack in memory, in the partial channel. + // The value will be checked with the CTL. 
+ let channel = lv.partial_channel; yield_constr.constraint(new_filter * (channel.used - P::ONES)); yield_constr.constraint(new_filter * channel.is_read); yield_constr.constraint(new_filter * (channel.addr_context - lv.context)); yield_constr.constraint( new_filter - * (channel.addr_segment - P::Scalar::from_canonical_u64(Segment::Stack as u64)), + * (channel.addr_segment + - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), ); let addr_virtual = lv.stack_len - P::ONES; yield_constr.constraint(new_filter * (channel.addr_virtual - addr_virtual)); - for (limb_ch, limb_top) in channel.value.iter().zip(lv.mem_channels[0].value.iter()) { - yield_constr.constraint(new_filter * (*limb_ch - *limb_top)); - } // Else you disable the channel. yield_constr.constraint( filter @@ -254,23 +263,14 @@ pub(crate) fn eval_packed_one( { yield_constr.constraint(filter * (*limb_old - *limb_new)); } - } - // Maybe constrain next stack_top. - // These are transition constraints: they don't apply to the last row. - if let Some(next_top_ch) = stack_behavior.new_top_stack_channel { - for (limb_ch, limb_top) in lv.mem_channels[next_top_ch] - .value - .iter() - .zip(nv.mem_channels[0].value.iter()) - { - yield_constr.constraint_transition(filter * (*limb_ch - *limb_top)); - } + // You can't have a write of the top of the stack, so you disable the corresponding flag. + yield_constr.constraint(filter * lv.partial_channel.used); } // Unused channels if stack_behavior.disable_other_channels { - // The first channel contains (or not) the top od the stack and is constrained elsewhere. + // The first channel contains (or not) the top of the stack and is constrained elsewhere. 
for i in max(1, stack_behavior.num_pops)..NUM_GP_CHANNELS - (stack_behavior.pushes as usize) { let channel = lv.mem_channels[i]; @@ -284,18 +284,93 @@ pub(crate) fn eval_packed_one( yield_constr.constraint_transition(filter * (nv.stack_len - (lv.stack_len - num_pops + push))); } -pub fn eval_packed( +/// Evaluates constraints for all opcodes' `StackBehavior`s. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, ) { - for (op, stack_behavior) in izip!(lv.op.into_iter(), STACK_BEHAVIORS.into_iter()) { + for (op, stack_behavior, might_overflow) in izip!( + lv.op.into_iter(), + STACK_BEHAVIORS.into_iter(), + MIGHT_OVERFLOW.into_iter() + ) { if let Some(stack_behavior) = stack_behavior { eval_packed_one(lv, nv, op, stack_behavior, yield_constr); } + + if might_overflow { + // Check for stack overflow in the next row. + let diff = nv.stack_len - P::Scalar::from_canonical_usize(MAX_USER_STACK_SIZE + 1); + let lhs = diff * lv.general.stack().stack_len_bounds_aux; + let rhs = P::ONES - nv.is_kernel_mode; + yield_constr.constraint_transition(op * (lhs - rhs)); + } + } + + // Constrain stack for JUMPDEST. + let jumpdest_filter = lv.op.jumpdest_keccak_general * lv.opcode_bits[1]; + eval_packed_one(lv, nv, jumpdest_filter, JUMPDEST_OP, yield_constr); + + // Constrain stack for KECCAK_GENERAL. + let keccak_general_filter = lv.op.jumpdest_keccak_general * (P::ONES - lv.opcode_bits[1]); + eval_packed_one( + lv, + nv, + keccak_general_filter, + KECCAK_GENERAL_OP, + yield_constr, + ); + + // Stack constraints for POP. + // The only constraints POP has are stack constraints. + // Since POP and NOT are combined into one flag and they have + // different stack behaviors, POP needs special stack constraints. + // Constrain `stack_inv_aux`. + let len_diff = lv.stack_len - P::Scalar::ONES; + yield_constr.constraint( + lv.op.not_pop + * (len_diff * lv.general.stack().stack_inv - lv.general.stack().stack_inv_aux), + ); + + // If stack_len != 1 and POP, read new top of the stack in nv.mem_channels[0]. + let top_read_channel = nv.mem_channels[0]; + let is_top_read = lv.general.stack().stack_inv_aux * (P::ONES - lv.opcode_bits[0]); + + // Constrain `stack_inv_aux_2`. It contains `stack_inv_aux * (1 - opcode_bits[0])`. 
+ yield_constr.constraint(lv.op.not_pop * (lv.general.stack().stack_inv_aux_2 - is_top_read)); + let new_filter = lv.op.not_pop * lv.general.stack().stack_inv_aux_2; + yield_constr.constraint_transition(new_filter * (top_read_channel.used - P::ONES)); + yield_constr.constraint_transition(new_filter * (top_read_channel.is_read - P::ONES)); + yield_constr.constraint_transition(new_filter * (top_read_channel.addr_context - nv.context)); + yield_constr.constraint_transition( + new_filter + * (top_read_channel.addr_segment + - P::Scalar::from_canonical_usize(Segment::Stack.unscale())), + ); + let addr_virtual = nv.stack_len - P::ONES; + yield_constr.constraint_transition(new_filter * (top_read_channel.addr_virtual - addr_virtual)); + // If stack_len == 1 or NOT, disable the channel. + // If NOT or (len==1 and POP), then `stack_inv_aux_2` = 0. + yield_constr.constraint( + lv.op.not_pop * (lv.general.stack().stack_inv_aux_2 - P::ONES) * top_read_channel.used, + ); + + // Disable remaining memory channels. + for &channel in &lv.mem_channels[1..] { + yield_constr.constraint(lv.op.not_pop * (lv.opcode_bits[0] - P::ONES) * channel.used); } + yield_constr + .constraint(lv.op.not_pop * (lv.opcode_bits[0] - P::ONES) * lv.partial_channel.used); + + // Constrain the new stack length for POP. + yield_constr.constraint_transition( + lv.op.not_pop * (lv.opcode_bits[0] - P::ONES) * (nv.stack_len - lv.stack_len + P::ONES), + ); } +/// Circuit version of `eval_packed_one`. +/// Evaluates constraints for one `StackBehavior`. 
pub(crate) fn eval_ext_circuit_one, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, @@ -325,7 +400,7 @@ pub(crate) fn eval_ext_circuit_one, const D: usize> { let constr = builder.arithmetic_extension( F::ONE, - -F::from_canonical_u64(Segment::Stack as u64), + -F::from_canonical_usize(Segment::Stack.unscale()), filter, channel.addr_segment, filter, @@ -346,6 +421,12 @@ pub(crate) fn eval_ext_circuit_one, const D: usize> } } + // You can't have a write of the top of the stack, so you disable the corresponding flag. + { + let constr = builder.mul_extension(filter, lv.partial_channel.used); + yield_constr.constraint(builder, constr); + } + // If you also push, you don't need to read the new top of the stack. // If you don't: // - if the stack isn't empty after the pops, you read the new top from an extra pop. @@ -376,7 +457,7 @@ pub(crate) fn eval_ext_circuit_one, const D: usize> { let constr = builder.arithmetic_extension( F::ONE, - -F::from_canonical_u64(Segment::Stack as u64), + -F::from_canonical_usize(Segment::Stack.unscale()), new_filter, channel.addr_segment, new_filter, @@ -410,7 +491,8 @@ pub(crate) fn eval_ext_circuit_one, const D: usize> // If len > 0... let new_filter = builder.mul_extension(lv.stack_len, filter); // You write the previous top of the stack in memory, in the last channel. 
- let channel = lv.mem_channels[NUM_GP_CHANNELS - 1]; + // The value will be checked with the CTL + let channel = lv.partial_channel; { let constr = builder.mul_sub_extension(new_filter, channel.used, new_filter); yield_constr.constraint(builder, constr); @@ -428,7 +510,7 @@ pub(crate) fn eval_ext_circuit_one, const D: usize> { let constr = builder.arithmetic_extension( F::ONE, - -F::from_canonical_u64(Segment::Stack as u64), + -F::from_canonical_usize(Segment::Stack.unscale()), new_filter, channel.addr_segment, new_filter, @@ -440,11 +522,6 @@ pub(crate) fn eval_ext_circuit_one, const D: usize> let constr = builder.arithmetic_extension(F::ONE, F::ONE, new_filter, diff, new_filter); yield_constr.constraint(builder, constr); } - for (limb_ch, limb_top) in channel.value.iter().zip(lv.mem_channels[0].value.iter()) { - let diff = builder.sub_extension(*limb_ch, *limb_top); - let constr = builder.mul_extension(new_filter, diff); - yield_constr.constraint(builder, constr); - } // Else you disable the channel. { let diff = builder.mul_extension(lv.stack_len, lv.general.stack().stack_inv); @@ -476,25 +553,17 @@ pub(crate) fn eval_ext_circuit_one, const D: usize> yield_constr.constraint(builder, constr); } } - } - // Maybe constrain next stack_top. - // These are transition constraints: they don't apply to the last row. - if let Some(next_top_ch) = stack_behavior.new_top_stack_channel { - for (limb_ch, limb_top) in lv.mem_channels[next_top_ch] - .value - .iter() - .zip(nv.mem_channels[0].value.iter()) + // You can't have a write of the top of the stack, so you disable the corresponding flag. 
{ - let diff = builder.sub_extension(*limb_ch, *limb_top); - let constr = builder.mul_extension(filter, diff); - yield_constr.constraint_transition(builder, constr); + let constr = builder.mul_extension(filter, lv.partial_channel.used); + yield_constr.constraint(builder, constr); } } // Unused channels if stack_behavior.disable_other_channels { - // The first channel contains (or not) the top od the stack and is constrained elsewhere. + // The first channel contains (or not) the top of the stack and is constrained elsewhere. for i in max(1, stack_behavior.num_pops)..NUM_GP_CHANNELS - (stack_behavior.pushes as usize) { let channel = lv.mem_channels[i]; @@ -514,15 +583,136 @@ pub(crate) fn eval_ext_circuit_one, const D: usize> yield_constr.constraint_transition(builder, constr); } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints for all opcodes' `StackBehavior`s. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, yield_constr: &mut RecursiveConstraintConsumer, ) { - for (op, stack_behavior) in izip!(lv.op.into_iter(), STACK_BEHAVIORS.into_iter()) { + for (op, stack_behavior, might_overflow) in izip!( + lv.op.into_iter(), + STACK_BEHAVIORS.into_iter(), + MIGHT_OVERFLOW.into_iter() + ) { if let Some(stack_behavior) = stack_behavior { eval_ext_circuit_one(builder, lv, nv, op, stack_behavior, yield_constr); } + + if might_overflow { + // Check for stack overflow in the next row. + let diff = builder.add_const_extension( + nv.stack_len, + -F::from_canonical_usize(MAX_USER_STACK_SIZE + 1), + ); + let prod = builder.mul_add_extension( + diff, + lv.general.stack().stack_len_bounds_aux, + nv.is_kernel_mode, + ); + let rhs = builder.add_const_extension(prod, -F::ONE); + let constr = builder.mul_extension(op, rhs); + yield_constr.constraint_transition(builder, constr); + } + } + + // Constrain stack for JUMPDEST. 
+ let jumpdest_filter = builder.mul_extension(lv.op.jumpdest_keccak_general, lv.opcode_bits[1]); + eval_ext_circuit_one(builder, lv, nv, jumpdest_filter, JUMPDEST_OP, yield_constr); + + // Constrain stack for KECCAK_GENERAL. + let one = builder.one_extension(); + let mut keccak_general_filter = builder.sub_extension(one, lv.opcode_bits[1]); + keccak_general_filter = + builder.mul_extension(lv.op.jumpdest_keccak_general, keccak_general_filter); + eval_ext_circuit_one( + builder, + lv, + nv, + keccak_general_filter, + KECCAK_GENERAL_OP, + yield_constr, + ); + + // Stack constraints for POP. + // The only constraints POP has are stack constraints. + // Since POP and NOT are combined into one flag and they have + // different stack behaviors, POP needs special stack constraints. + // Constrain `stack_inv_aux`. + { + let len_diff = builder.add_const_extension(lv.stack_len, F::NEG_ONE); + let diff = builder.mul_sub_extension( + len_diff, + lv.general.stack().stack_inv, + lv.general.stack().stack_inv_aux, + ); + let constr = builder.mul_extension(lv.op.not_pop, diff); + yield_constr.constraint(builder, constr); + } + // If stack_len != 4 and MSTORE, read new top of the stack in nv.mem_channels[0]. + let top_read_channel = nv.mem_channels[0]; + let is_top_read = builder.mul_extension(lv.general.stack().stack_inv_aux, lv.opcode_bits[0]); + let is_top_read = builder.sub_extension(lv.general.stack().stack_inv_aux, is_top_read); + // Constrain `stack_inv_aux_2`. It contains `stack_inv_aux * opcode_bits[0]`. 
+ { + let diff = builder.sub_extension(lv.general.stack().stack_inv_aux_2, is_top_read); + let constr = builder.mul_extension(lv.op.not_pop, diff); + yield_constr.constraint(builder, constr); } + let new_filter = builder.mul_extension(lv.op.not_pop, lv.general.stack().stack_inv_aux_2); + { + let constr = builder.mul_sub_extension(new_filter, top_read_channel.used, new_filter); + yield_constr.constraint_transition(builder, constr); + } + { + let constr = builder.mul_sub_extension(new_filter, top_read_channel.is_read, new_filter); + yield_constr.constraint_transition(builder, constr); + } + { + let diff = builder.sub_extension(top_read_channel.addr_context, nv.context); + let constr = builder.mul_extension(new_filter, diff); + yield_constr.constraint_transition(builder, constr); + } + { + let diff = builder.add_const_extension( + top_read_channel.addr_segment, + -F::from_canonical_usize(Segment::Stack.unscale()), + ); + let constr = builder.mul_extension(new_filter, diff); + yield_constr.constraint_transition(builder, constr); + } + { + let addr_virtual = builder.add_const_extension(nv.stack_len, -F::ONE); + let diff = builder.sub_extension(top_read_channel.addr_virtual, addr_virtual); + let constr = builder.mul_extension(new_filter, diff); + yield_constr.constraint_transition(builder, constr); + } + // If stack_len == 1 or NOT, disable the channel. + { + let diff = builder.mul_sub_extension( + lv.op.not_pop, + lv.general.stack().stack_inv_aux_2, + lv.op.not_pop, + ); + let constr = builder.mul_extension(diff, top_read_channel.used); + yield_constr.constraint(builder, constr); + } + + // Disable remaining memory channels. + let filter = builder.mul_sub_extension(lv.op.not_pop, lv.opcode_bits[0], lv.op.not_pop); + for &channel in &lv.mem_channels[1..] 
{ + let constr = builder.mul_extension(filter, channel.used); + yield_constr.constraint(builder, constr); + } + { + let constr = builder.mul_extension(filter, lv.partial_channel.used); + yield_constr.constraint(builder, constr); + } + + // Constrain the new stack length for POP. + let diff = builder.sub_extension(nv.stack_len, lv.stack_len); + let mut constr = builder.add_const_extension(diff, F::ONES); + constr = builder.mul_extension(filter, constr); + yield_constr.constraint_transition(builder, constr); } diff --git a/evm/src/cpu/stack_bounds.rs b/evm/src/cpu/stack_bounds.rs deleted file mode 100644 index e66e6686b5..0000000000 --- a/evm/src/cpu/stack_bounds.rs +++ /dev/null @@ -1,60 +0,0 @@ -//! Checks for stack overflow. -//! -//! The constraints defined herein validate that stack overflow did not occur. For example, if `dup` -//! is set but the copy would overflow, these constraints would make the proof unverifiable. -//! -//! Faults are handled under a separate operation flag, `exception` , which traps to the kernel. The -//! kernel then handles the exception. However, before it may do so, it must verify in software that -//! an exception did in fact occur (i.e. the trap was warranted) and `PANIC` otherwise; this -//! prevents the prover from faking an exception on a valid operation. - -use plonky2::field::extension::Extendable; -use plonky2::field::packed::PackedField; -use plonky2::field::types::Field; -use plonky2::hash::hash_types::RichField; -use plonky2::iop::ext_target::ExtensionTarget; - -use super::columns::COL_MAP; -use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::cpu::columns::CpuColumnsView; - -pub const MAX_USER_STACK_SIZE: usize = 1024; - -pub fn eval_packed( - lv: &CpuColumnsView

, - yield_constr: &mut ConstraintConsumer

, -) { - // If we're in user mode, ensure that the stack length is not 1025. Note that a stack length of - // 1024 is valid. 1025 means we've gone one over, which is necessary for overflow, as an EVM - // opcode increases the stack length by at most one. - - let filter: P = COL_MAP.op.iter().map(|&col_i| lv[col_i]).sum(); - let diff = lv.stack_len - P::Scalar::from_canonical_usize(MAX_USER_STACK_SIZE + 1); - let lhs = diff * lv.stack_len_bounds_aux; - let rhs = P::ONES - lv.is_kernel_mode; - - yield_constr.constraint(filter * (lhs - rhs)); -} - -pub fn eval_ext_circuit, const D: usize>( - builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, - lv: &CpuColumnsView>, - yield_constr: &mut RecursiveConstraintConsumer, -) { - // If we're in user mode, ensure that the stack length is not 1025. Note that a stack length of - // 1024 is valid. 1025 means we've gone one over, which is necessary for overflow, as an EVM - // opcode increases the stack length by at most one. - - let filter = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| lv[col_i])); - - let lhs = builder.arithmetic_extension( - F::ONE, - -F::from_canonical_usize(MAX_USER_STACK_SIZE + 1), - lv.stack_len, - lv.stack_len_bounds_aux, - lv.stack_len_bounds_aux, - ); - let constr = builder.add_extension(lhs, lv.is_kernel_mode); - let constr = builder.mul_sub_extension(filter, constr, filter); - yield_constr.constraint(builder, constr); -} diff --git a/evm/src/cpu/syscalls_exceptions.rs b/evm/src/cpu/syscalls_exceptions.rs index 1437fba02b..1dfdb8fa2c 100644 --- a/evm/src/cpu/syscalls_exceptions.rs +++ b/evm/src/cpu/syscalls_exceptions.rs @@ -7,7 +7,6 @@ use plonky2::field::packed::PackedField; use plonky2::field::types::Field; use plonky2::hash::hash_types::RichField; use plonky2::iop::ext_target::ExtensionTarget; -use static_assertions::const_assert; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::columns::CpuColumnsView; @@ -17,9 +16,9 @@ use 
crate::memory::segments::Segment; // Copy the constant but make it `usize`. const BYTES_PER_OFFSET: usize = crate::cpu::kernel::assembler::BYTES_PER_OFFSET as usize; -const_assert!(BYTES_PER_OFFSET < NUM_GP_CHANNELS); // Reserve one channel for stack push -pub fn eval_packed( +/// Evaluates constraints for syscalls and exceptions. +pub(crate) fn eval_packed( lv: &CpuColumnsView

, nv: &CpuColumnsView

, yield_constr: &mut ConstraintConsumer

, @@ -28,6 +27,12 @@ pub fn eval_packed( let filter_exception = lv.op.exception; let total_filter = filter_syscall + filter_exception; + // First, constrain filters to be boolean. + // Ensuring they are mutually exclusive is done in other modules + // through the `is_cpu_cycle` variable. + yield_constr.constraint(filter_syscall * (filter_syscall - P::ONES)); + yield_constr.constraint(filter_exception * (filter_exception - P::ONES)); + // If exception, ensure we are not in kernel mode yield_constr.constraint(filter_exception * lv.is_kernel_mode); @@ -44,7 +49,7 @@ pub fn eval_packed( } // Look up the handler in memory - let code_segment = P::Scalar::from_canonical_usize(Segment::Code as usize); + let code_segment = P::Scalar::from_canonical_usize(Segment::Code.unscale()); let opcode: P = lv .opcode_bits @@ -64,43 +69,40 @@ pub fn eval_packed( let exc_handler_addr_start = exc_jumptable_start + exc_code * P::Scalar::from_canonical_usize(BYTES_PER_OFFSET); - for (i, channel) in lv.mem_channels[1..BYTES_PER_OFFSET + 1].iter().enumerate() { - yield_constr.constraint(total_filter * (channel.used - P::ONES)); - yield_constr.constraint(total_filter * (channel.is_read - P::ONES)); + let jumpdest_channel = lv.mem_channels[1]; + + // Set `used` and `is_read`. + // The channel is not used: the reads will be done with the byte packing CTL. + yield_constr.constraint(total_filter * (jumpdest_channel.used)); + yield_constr.constraint(total_filter * (jumpdest_channel.is_read - P::ONES)); - // Set kernel context and code segment - yield_constr.constraint(total_filter * channel.addr_context); - yield_constr.constraint(total_filter * (channel.addr_segment - code_segment)); + // Set kernel context and code segment + yield_constr.constraint(total_filter * jumpdest_channel.addr_context); + yield_constr.constraint(total_filter * (jumpdest_channel.addr_segment - code_segment)); - // Set address, using a separate channel for each of the `BYTES_PER_OFFSET` limbs. 
- let limb_address_syscall = opcode_handler_addr_start + P::Scalar::from_canonical_usize(i); - let limb_address_exception = exc_handler_addr_start + P::Scalar::from_canonical_usize(i); + // Set address. + yield_constr + .constraint(filter_syscall * (jumpdest_channel.addr_virtual - opcode_handler_addr_start)); + yield_constr + .constraint(filter_exception * (jumpdest_channel.addr_virtual - exc_handler_addr_start)); - yield_constr.constraint(filter_syscall * (channel.addr_virtual - limb_address_syscall)); - yield_constr.constraint(filter_exception * (channel.addr_virtual - limb_address_exception)); + // Set higher limbs to zero. + for &limb in &jumpdest_channel.value[1..] { + yield_constr.constraint(total_filter * limb); } - // Disable unused channels (the last channel is used to push to the stack) - for channel in &lv.mem_channels[BYTES_PER_OFFSET + 1..NUM_GP_CHANNELS - 1] { + // Disable unused channels + for channel in &lv.mem_channels[2..NUM_GP_CHANNELS] { yield_constr.constraint(total_filter * channel.used); } // Set program counter to the handler address - // The addresses are big-endian in memory - let target = lv.mem_channels[1..BYTES_PER_OFFSET + 1] - .iter() - .map(|channel| channel.value[0]) - .fold(P::ZEROS, |cumul, limb| { - cumul * P::Scalar::from_canonical_u64(256) + limb - }); - yield_constr.constraint_transition(total_filter * (nv.program_counter - target)); + yield_constr + .constraint_transition(total_filter * (nv.program_counter - jumpdest_channel.value[0])); // Set kernel mode yield_constr.constraint_transition(total_filter * (nv.is_kernel_mode - P::ONES)); - // Maintain current context - yield_constr.constraint_transition(total_filter * (nv.context - lv.context)); // Reset gas counter to zero. 
- yield_constr.constraint_transition(total_filter * nv.gas[0]); - yield_constr.constraint_transition(total_filter * nv.gas[1]); + yield_constr.constraint_transition(total_filter * nv.gas); let output = nv.mem_channels[0].value; // New top of the stack: current PC + 1 (limb 0), kernel flag (limb 1), gas counter (limbs 6 and 7). @@ -108,9 +110,8 @@ pub fn eval_packed( yield_constr.constraint(filter_exception * (output[0] - lv.program_counter)); // Check the kernel mode, for syscalls only yield_constr.constraint(filter_syscall * (output[1] - lv.is_kernel_mode)); - // TODO: Range check `output[6] and output[7]`. - yield_constr.constraint(total_filter * (output[6] - lv.gas[0])); - yield_constr.constraint(total_filter * (output[7] - lv.gas[1])); + yield_constr.constraint(total_filter * (output[6] - lv.gas)); + yield_constr.constraint(total_filter * output[7]); // High limb of gas is zero. // Zero the rest of that register // output[1] is 0 for exceptions, but not for syscalls @@ -120,7 +121,9 @@ pub fn eval_packed( } } -pub fn eval_ext_circuit, const D: usize>( +/// Circuit version of `eval_packed`. +/// Evaluates constraints for syscalls and exceptions. +pub(crate) fn eval_ext_circuit, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, lv: &CpuColumnsView>, nv: &CpuColumnsView>, @@ -130,6 +133,14 @@ pub fn eval_ext_circuit, const D: usize>( let filter_exception = lv.op.exception; let total_filter = builder.add_extension(filter_syscall, filter_exception); + // First, constrain filters to be boolean. + // Ensuring they are mutually exclusive is done in other modules + // through the `is_cpu_cycle` variable. 
+ let constr = builder.mul_sub_extension(filter_syscall, filter_syscall, filter_syscall); + yield_constr.constraint(builder, constr); + let constr = builder.mul_sub_extension(filter_exception, filter_exception, filter_exception); + yield_constr.constraint(builder, constr); + // Ensure that, if exception, we are not in kernel mode let constr = builder.mul_extension(filter_exception, lv.is_kernel_mode); yield_constr.constraint(builder, constr); @@ -151,7 +162,7 @@ pub fn eval_ext_circuit, const D: usize>( } // Look up the handler in memory - let code_segment = F::from_canonical_usize(Segment::Code as usize); + let code_segment = F::from_canonical_usize(Segment::Code.unscale()); let opcode = lv .opcode_bits @@ -181,60 +192,58 @@ pub fn eval_ext_circuit, const D: usize>( exc_jumptable_start, ); - for (i, channel) in lv.mem_channels[1..BYTES_PER_OFFSET + 1].iter().enumerate() { - { - let constr = builder.mul_sub_extension(total_filter, channel.used, total_filter); - yield_constr.constraint(builder, constr); - } - { - let constr = builder.mul_sub_extension(total_filter, channel.is_read, total_filter); - yield_constr.constraint(builder, constr); - } - - // Set kernel context and code segment - { - let constr = builder.mul_extension(total_filter, channel.addr_context); - yield_constr.constraint(builder, constr); - } - { - let constr = builder.arithmetic_extension( - F::ONE, - -code_segment, - total_filter, - channel.addr_segment, - total_filter, - ); - yield_constr.constraint(builder, constr); - } - - // Set address, using a separate channel for each of the `BYTES_PER_OFFSET` limbs. 
- { - let diff_syscall = - builder.sub_extension(channel.addr_virtual, opcode_handler_addr_start); - let constr = builder.arithmetic_extension( - F::ONE, - -F::from_canonical_usize(i), - filter_syscall, - diff_syscall, - filter_syscall, - ); - yield_constr.constraint(builder, constr); - - let diff_exception = - builder.sub_extension(channel.addr_virtual, exc_handler_addr_start); - let constr = builder.arithmetic_extension( - F::ONE, - -F::from_canonical_usize(i), - filter_exception, - diff_exception, - filter_exception, - ); - yield_constr.constraint(builder, constr); - } + let jumpdest_channel = lv.mem_channels[1]; + + // Set `used` and `is_read`. + // The channel is not used: the reads will be done with the byte packing CTL. + { + let constr = builder.mul_extension(total_filter, jumpdest_channel.used); + yield_constr.constraint(builder, constr); + } + { + let constr = + builder.mul_sub_extension(total_filter, jumpdest_channel.is_read, total_filter); + yield_constr.constraint(builder, constr); } - // Disable unused channels (the last channel is used to push to the stack) - for channel in &lv.mem_channels[BYTES_PER_OFFSET + 1..NUM_GP_CHANNELS - 1] { + // Set kernel context and code segment + { + let constr = builder.mul_extension(total_filter, jumpdest_channel.addr_context); + yield_constr.constraint(builder, constr); + } + { + let constr = builder.arithmetic_extension( + F::ONE, + -code_segment, + total_filter, + jumpdest_channel.addr_segment, + total_filter, + ); + yield_constr.constraint(builder, constr); + } + + // Set address. 
+ { + let diff_syscall = + builder.sub_extension(jumpdest_channel.addr_virtual, opcode_handler_addr_start); + let constr = builder.mul_extension(filter_syscall, diff_syscall); + yield_constr.constraint(builder, constr); + } + { + let diff_exception = + builder.sub_extension(jumpdest_channel.addr_virtual, exc_handler_addr_start); + let constr = builder.mul_extension(filter_exception, diff_exception); + yield_constr.constraint(builder, constr); + } + + // Set higher limbs to zero. + for &limb in &jumpdest_channel.value[1..] { + let constr = builder.mul_extension(total_filter, limb); + yield_constr.constraint(builder, constr); + } + + // Disable unused channels + for channel in &lv.mem_channels[2..NUM_GP_CHANNELS] { let constr = builder.mul_extension(total_filter, channel.used); yield_constr.constraint(builder, constr); } @@ -242,13 +251,7 @@ pub fn eval_ext_circuit, const D: usize>( // Set program counter to the handler address // The addresses are big-endian in memory { - let target = lv.mem_channels[1..BYTES_PER_OFFSET + 1] - .iter() - .map(|channel| channel.value[0]) - .fold(builder.zero_extension(), |cumul, limb| { - builder.mul_const_add_extension(F::from_canonical_u64(256), cumul, limb) - }); - let diff = builder.sub_extension(nv.program_counter, target); + let diff = builder.sub_extension(nv.program_counter, jumpdest_channel.value[0]); let constr = builder.mul_extension(total_filter, diff); yield_constr.constraint_transition(builder, constr); } @@ -257,17 +260,9 @@ pub fn eval_ext_circuit, const D: usize>( let constr = builder.mul_sub_extension(total_filter, nv.is_kernel_mode, total_filter); yield_constr.constraint_transition(builder, constr); } - // Maintain current context - { - let diff = builder.sub_extension(nv.context, lv.context); - let constr = builder.mul_extension(total_filter, diff); - yield_constr.constraint_transition(builder, constr); - } // Reset gas counter to zero. 
{ - let constr = builder.mul_extension(total_filter, nv.gas[0]); - yield_constr.constraint_transition(builder, constr); - let constr = builder.mul_extension(total_filter, nv.gas[1]); + let constr = builder.mul_extension(total_filter, nv.gas); yield_constr.constraint_transition(builder, constr); } @@ -292,15 +287,14 @@ pub fn eval_ext_circuit, const D: usize>( let constr = builder.mul_extension(filter_syscall, diff); yield_constr.constraint(builder, constr); } - // TODO: Range check `output[6]` and `output[7]. { - let diff = builder.sub_extension(output[6], lv.gas[0]); + let diff = builder.sub_extension(output[6], lv.gas); let constr = builder.mul_extension(total_filter, diff); yield_constr.constraint(builder, constr); } { - let diff = builder.sub_extension(output[7], lv.gas[1]); - let constr = builder.mul_extension(total_filter, diff); + // High limb of gas is zero. + let constr = builder.mul_extension(total_filter, output[7]); yield_constr.constraint(builder, constr); } diff --git a/evm/src/cross_table_lookup.rs b/evm/src/cross_table_lookup.rs index 621403f912..359b5309e8 100644 --- a/evm/src/cross_table_lookup.rs +++ b/evm/src/cross_table_lookup.rs @@ -1,6 +1,34 @@ -use std::borrow::Borrow; -use std::fmt::Debug; -use std::iter::repeat; +//! This crate provides support for cross-table lookups. +//! +//! If a STARK S_1 calls an operation that is carried out by another STARK S_2, +//! S_1 provides the inputs to S_2 and reads the output from S_1. To ensure that +//! the operation was correctly carried out, we must check that the provided inputs +//! and outputs are correctly read. Cross-table lookups carry out that check. +//! +//! To achieve this, smaller CTL tables are created on both sides: looking and looked tables. +//! In our example, we create a table S_1' comprised of columns -- or linear combinations +//! of columns -- of S_1, and rows that call operations carried out in S_2. We also create a +//! 
table S_2' comprised of columns -- or linear combinations od columns -- of S_2 and rows +//! that carry out the operations needed by other STARKs. Then, S_1' is a looking table for +//! the looked S_2', since we want to check that the operation outputs in S_1' are indeeed in S_2'. +//! Furthermore, the concatenation of all tables looking into S_2' must be equal to S_2'. +//! +//! To achieve this, we construct, for each table, a permutation polynomial Z(x). +//! Z(x) is computed as the product of all its column combinations. +//! To check it was correctly constructed, we check: +//! - Z(gw) = Z(w) * combine(w) where combine(w) is the column combination at point w. +//! - Z(g^(n-1)) = combine(1). +//! - The verifier also checks that the product of looking table Z polynomials is equal +//! to the associated looked table Z polynomial. +//! Note that the first two checks are written that way because Z polynomials are computed +//! upside down for convenience. +//! +//! Additionally, we support cross-table lookups over two rows. The permutation principle +//! is similar, but we provide not only `local_values` but also `next_values` -- corresponding to +//! the current and next row values -- when computing the linear combinations. 
+ +use core::cmp::min; +use core::fmt::Debug; use anyhow::{ensure, Result}; use itertools::Itertools; @@ -14,265 +42,57 @@ use plonky2::iop::ext_target::ExtensionTarget; use plonky2::iop::target::Target; use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::config::{AlgebraicHasher, GenericConfig, Hasher}; -use plonky2::plonk::plonk_common::{ - reduce_with_powers, reduce_with_powers_circuit, reduce_with_powers_ext_circuit, -}; +use plonky2::util::ceil_div_usize; use plonky2::util::serialization::{Buffer, IoResult, Read, Write}; -use crate::all_stark::{Table, NUM_TABLES}; use crate::config::StarkConfig; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::evaluation_frame::StarkEvaluationFrame; +use crate::lookup::{ + eval_helper_columns, eval_helper_columns_circuit, get_helper_cols, Column, ColumnFilter, + Filter, GrandProductChallenge, +}; use crate::proof::{StarkProofTarget, StarkProofWithMetadata}; use crate::stark::Stark; -/// Represent a linear combination of columns. 
-#[derive(Clone, Debug)] -pub struct Column { - linear_combination: Vec<(usize, F)>, - next_row_linear_combination: Vec<(usize, F)>, - constant: F, -} - -impl Column { - pub fn single(c: usize) -> Self { - Self { - linear_combination: vec![(c, F::ONE)], - next_row_linear_combination: vec![], - constant: F::ZERO, - } - } - - pub fn singles>>( - cs: I, - ) -> impl Iterator { - cs.into_iter().map(|c| Self::single(*c.borrow())) - } - - pub fn single_next_row(c: usize) -> Self { - Self { - linear_combination: vec![], - next_row_linear_combination: vec![(c, F::ONE)], - constant: F::ZERO, - } - } - - pub fn singles_next_row>>( - cs: I, - ) -> impl Iterator { - cs.into_iter().map(|c| Self::single_next_row(*c.borrow())) - } - - pub fn constant(constant: F) -> Self { - Self { - linear_combination: vec![], - next_row_linear_combination: vec![], - constant, - } - } - - pub fn zero() -> Self { - Self::constant(F::ZERO) - } - - pub fn one() -> Self { - Self::constant(F::ONE) - } - - pub fn linear_combination_with_constant>( - iter: I, - constant: F, - ) -> Self { - let v = iter.into_iter().collect::>(); - assert!(!v.is_empty()); - debug_assert_eq!( - v.iter().map(|(c, _)| c).unique().count(), - v.len(), - "Duplicate columns." - ); - Self { - linear_combination: v, - next_row_linear_combination: vec![], - constant, - } - } - - pub fn linear_combination_and_next_row_with_constant>( - iter: I, - next_row_iter: I, - constant: F, - ) -> Self { - let v = iter.into_iter().collect::>(); - let next_row_v = next_row_iter.into_iter().collect::>(); - - assert!(!v.is_empty() || !next_row_v.is_empty()); - debug_assert_eq!( - v.iter().map(|(c, _)| c).unique().count(), - v.len(), - "Duplicate columns." - ); - debug_assert_eq!( - next_row_v.iter().map(|(c, _)| c).unique().count(), - next_row_v.len(), - "Duplicate columns." 
- ); - - Self { - linear_combination: v, - next_row_linear_combination: next_row_v, - constant, - } - } - - pub fn linear_combination>(iter: I) -> Self { - Self::linear_combination_with_constant(iter, F::ZERO) - } - - pub fn le_bits>>(cs: I) -> Self { - Self::linear_combination(cs.into_iter().map(|c| *c.borrow()).zip(F::TWO.powers())) - } - - pub fn le_bytes>>(cs: I) -> Self { - Self::linear_combination( - cs.into_iter() - .map(|c| *c.borrow()) - .zip(F::from_canonical_u16(256).powers()), - ) - } - - pub fn sum>>(cs: I) -> Self { - Self::linear_combination(cs.into_iter().map(|c| *c.borrow()).zip(repeat(F::ONE))) - } - - pub fn eval(&self, v: &[P]) -> P - where - FE: FieldExtension, - P: PackedField, - { - self.linear_combination - .iter() - .map(|&(c, f)| v[c] * FE::from_basefield(f)) - .sum::

() - + FE::from_basefield(self.constant) - } - - pub fn eval_with_next(&self, v: &[P], next_v: &[P]) -> P - where - FE: FieldExtension, - P: PackedField, - { - self.linear_combination - .iter() - .map(|&(c, f)| v[c] * FE::from_basefield(f)) - .sum::

() - + self - .next_row_linear_combination - .iter() - .map(|&(c, f)| next_v[c] * FE::from_basefield(f)) - .sum::

() - + FE::from_basefield(self.constant) - } - - /// Evaluate on an row of a table given in column-major form. - pub fn eval_table(&self, table: &[PolynomialValues], row: usize) -> F { - let mut res = self - .linear_combination - .iter() - .map(|&(c, f)| table[c].values[row] * f) - .sum::() - + self.constant; - - // If we access the next row at the last row, for sanity, we consider the next row's values to be 0. - // If CTLs are correctly written, the filter should be 0 in that case anyway. - if !self.next_row_linear_combination.is_empty() && row < table[0].values.len() - 1 { - res += self - .next_row_linear_combination - .iter() - .map(|&(c, f)| table[c].values[row + 1] * f) - .sum::(); - } - - res - } - - pub fn eval_circuit( - &self, - builder: &mut CircuitBuilder, - v: &[ExtensionTarget], - ) -> ExtensionTarget - where - F: RichField + Extendable, - { - let pairs = self - .linear_combination - .iter() - .map(|&(c, f)| { - ( - v[c], - builder.constant_extension(F::Extension::from_basefield(f)), - ) - }) - .collect::>(); - let constant = builder.constant_extension(F::Extension::from_basefield(self.constant)); - builder.inner_product_extension(F::ONE, constant, pairs) - } - - pub fn eval_with_next_circuit( - &self, - builder: &mut CircuitBuilder, - v: &[ExtensionTarget], - next_v: &[ExtensionTarget], - ) -> ExtensionTarget - where - F: RichField + Extendable, - { - let mut pairs = self - .linear_combination - .iter() - .map(|&(c, f)| { - ( - v[c], - builder.constant_extension(F::Extension::from_basefield(f)), - ) - }) - .collect::>(); - let next_row_pairs = self.next_row_linear_combination.iter().map(|&(c, f)| { - ( - next_v[c], - builder.constant_extension(F::Extension::from_basefield(f)), - ) - }); - pairs.extend(next_row_pairs); - let constant = builder.constant_extension(F::Extension::from_basefield(self.constant)); - builder.inner_product_extension(F::ONE, constant, pairs) - } -} +/// An alias for `usize`, to represent the index of a STARK table in a 
multi-STARK setting. +pub(crate) type TableIdx = usize; +/// A `table` index with a linear combination of columns and a filter. +/// `filter` is used to determine the rows to select in `table`. +/// `columns` represents linear combinations of the columns of `table`. #[derive(Clone, Debug)] -pub struct TableWithColumns { - table: Table, +pub(crate) struct TableWithColumns { + table: TableIdx, columns: Vec>, - pub(crate) filter_column: Option>, + pub(crate) filter: Option>, } impl TableWithColumns { - pub fn new(table: Table, columns: Vec>, filter_column: Option>) -> Self { + /// Generates a new `TableWithColumns` given a `table` index, a linear combination of columns `columns` and a `filter`. + pub(crate) fn new(table: TableIdx, columns: Vec>, filter: Option>) -> Self { Self { table, columns, - filter_column, + filter, } } } +/// Cross-table lookup data consisting in the lookup table (`looked_table`) and all the tables that look into `looked_table` (`looking_tables`). +/// Each `looking_table` corresponds to a STARK's table whose rows have been filtered out and whose columns have been through a linear combination (see `eval_table`). The concatenation of those smaller tables should result in the `looked_table`. #[derive(Clone)] pub struct CrossTableLookup { + /// Column linear combinations for all tables that are looking into the current table. pub(crate) looking_tables: Vec>, + /// Column linear combination for the current table. pub(crate) looked_table: TableWithColumns, } impl CrossTableLookup { - pub fn new( + /// Creates a new `CrossTableLookup` given some looking tables and a looked table. + /// All tables should have the same width. 
+ pub(crate) fn new( looking_tables: Vec>, looked_table: TableWithColumns, ) -> Self { @@ -285,102 +105,119 @@ impl CrossTableLookup { } } - pub(crate) fn num_ctl_zs(ctls: &[Self], table: Table, num_challenges: usize) -> usize { + /// Given a table, returns: + /// - the total number of helper columns for this table, over all Cross-table lookups, + /// - the total number of z polynomials for this table, over all Cross-table lookups, + /// - the number of helper columns for this table, for each Cross-table lookup. + pub(crate) fn num_ctl_helpers_zs_all( + ctls: &[Self], + table: TableIdx, + num_challenges: usize, + constraint_degree: usize, + ) -> (usize, usize, Vec) { + let mut num_helpers = 0; let mut num_ctls = 0; - for ctl in ctls { + let mut num_helpers_by_ctl = vec![0; ctls.len()]; + for (i, ctl) in ctls.iter().enumerate() { let all_tables = std::iter::once(&ctl.looked_table).chain(&ctl.looking_tables); - num_ctls += all_tables.filter(|twc| twc.table == table).count(); + let num_appearances = all_tables.filter(|twc| twc.table == table).count(); + let is_helpers = num_appearances > 2; + if is_helpers { + num_helpers_by_ctl[i] = ceil_div_usize(num_appearances, constraint_degree - 1); + num_helpers += num_helpers_by_ctl[i]; + } + + if num_appearances > 0 { + num_ctls += 1; + } } - num_ctls * num_challenges + ( + num_helpers * num_challenges, + num_ctls * num_challenges, + num_helpers_by_ctl, + ) } } /// Cross-table lookup data for one table. #[derive(Clone, Default)] -pub struct CtlData { - pub(crate) zs_columns: Vec>, +pub(crate) struct CtlData<'a, F: Field> { + /// Data associated with all Z(x) polynomials for one table. + pub(crate) zs_columns: Vec>, } /// Cross-table lookup data associated with one Z(x) polynomial. +/// One Z(x) polynomial can be associated to multiple tables, +/// built from the same STARK. #[derive(Clone)] -pub(crate) struct CtlZData { +pub(crate) struct CtlZData<'a, F: Field> { + /// Helper columns to verify the Z polynomial values. 
+ pub(crate) helper_columns: Vec>, + /// Z polynomial values. pub(crate) z: PolynomialValues, + /// Cross-table lookup challenge. pub(crate) challenge: GrandProductChallenge, - pub(crate) columns: Vec>, - pub(crate) filter_column: Option>, + /// Vector of column linear combinations for the current tables. + pub(crate) columns: Vec<&'a [Column]>, + /// Vector of filter columns for the current table. + /// Each filter evaluates to either 1 or 0. + pub(crate) filter: Vec>>, } -impl CtlData { - pub fn len(&self) -> usize { +impl<'a, F: Field> CtlData<'a, F> { + /// Returns the number of cross-table lookup polynomials. + pub(crate) fn len(&self) -> usize { self.zs_columns.len() } - pub fn is_empty(&self) -> bool { + /// Returns whether there are no cross-table lookups. + pub(crate) fn is_empty(&self) -> bool { self.zs_columns.is_empty() } - pub fn z_polys(&self) -> Vec> { - self.zs_columns + /// Returns all the cross-table lookup helper polynomials. + pub(crate) fn ctl_helper_polys(&self) -> Vec> { + let num_polys = self + .zs_columns .iter() - .map(|zs_columns| zs_columns.z.clone()) - .collect() - } -} - -/// Randomness for a single instance of a permutation check protocol. -#[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub(crate) struct GrandProductChallenge { - /// Randomness used to combine multiple columns into one. - pub(crate) beta: T, - /// Random offset that's added to the beta-reduced column values. 
- pub(crate) gamma: T, -} + .fold(0, |acc, z| acc + z.helper_columns.len()); + let mut res = Vec::with_capacity(num_polys); + for z in &self.zs_columns { + res.extend(z.helper_columns.clone()); + } -impl GrandProductChallenge { - pub(crate) fn combine<'a, FE, P, T: IntoIterator, const D2: usize>( - &self, - terms: T, - ) -> P - where - FE: FieldExtension, - P: PackedField, - T::IntoIter: DoubleEndedIterator, - { - reduce_with_powers(terms, FE::from_basefield(self.beta)) + FE::from_basefield(self.gamma) + res } -} -impl GrandProductChallenge { - pub(crate) fn combine_circuit, const D: usize>( - &self, - builder: &mut CircuitBuilder, - terms: &[ExtensionTarget], - ) -> ExtensionTarget { - let reduced = reduce_with_powers_ext_circuit(builder, terms, self.beta); - let gamma = builder.convert_to_ext(self.gamma); - builder.add_extension(reduced, gamma) + /// Returns all the Z cross-table-lookup polynomials. + pub(crate) fn ctl_z_polys(&self) -> Vec> { + let mut res = Vec::with_capacity(self.zs_columns.len()); + for z in &self.zs_columns { + res.push(z.z.clone()); + } + + res } -} + /// Returns the number of helper columns for each STARK in each + /// `CtlZData`. + pub(crate) fn num_ctl_helper_polys(&self) -> Vec { + let mut res = Vec::with_capacity(self.zs_columns.len()); + for z in &self.zs_columns { + res.push(z.helper_columns.len()); + } -impl GrandProductChallenge { - pub(crate) fn combine_base_circuit, const D: usize>( - &self, - builder: &mut CircuitBuilder, - terms: &[Target], - ) -> Target { - let reduced = reduce_with_powers_circuit(builder, terms, self.beta); - builder.add(reduced, self.gamma) + res } } /// Like `PermutationChallenge`, but with `num_challenges` copies to boost soundness. 
#[derive(Clone, Eq, PartialEq, Debug)] -pub(crate) struct GrandProductChallengeSet { +pub struct GrandProductChallengeSet { pub(crate) challenges: Vec>, } impl GrandProductChallengeSet { - pub fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { + pub(crate) fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { buffer.write_usize(self.challenges.len())?; for challenge in &self.challenges { buffer.write_target(challenge.beta)?; @@ -389,7 +226,7 @@ impl GrandProductChallengeSet { Ok(()) } - pub fn from_buffer(buffer: &mut Buffer) -> IoResult { + pub(crate) fn from_buffer(buffer: &mut Buffer) -> IoResult { let length = buffer.read_usize()?; let mut challenges = Vec::with_capacity(length); for _ in 0..length { @@ -449,12 +286,47 @@ pub(crate) fn get_grand_product_challenge_set_target< GrandProductChallengeSet { challenges } } -pub(crate) fn cross_table_lookup_data( - trace_poly_values: &[Vec>; NUM_TABLES], - cross_table_lookups: &[CrossTableLookup], +/// Returns the number of helper columns for each `Table`. +pub(crate) fn num_ctl_helper_columns_by_table( + ctls: &[CrossTableLookup], + constraint_degree: usize, +) -> Vec<[usize; N]> { + let mut res = vec![[0; N]; ctls.len()]; + for (i, ctl) in ctls.iter().enumerate() { + let CrossTableLookup { + looking_tables, + looked_table: _, + } = ctl; + let mut num_by_table = [0; N]; + + let grouped_lookups = looking_tables.iter().group_by(|&a| a.table); + + for (table, group) in grouped_lookups.into_iter() { + let sum = group.count(); + if sum > 2 { + // We only need helper columns if there are more than 2 columns. + num_by_table[table] = ceil_div_usize(sum, constraint_degree - 1); + } + } + + res[i] = num_by_table; + } + res +} + +/// Generates all the cross-table lookup data, for all tables. +/// - `trace_poly_values` corresponds to the trace values for all tables. +/// - `cross_table_lookups` corresponds to all the cross-table lookups, i.e. the looked and looking tables, as described in `CrossTableLookup`. 
+/// - `ctl_challenges` corresponds to the challenges used for CTLs. +/// - `constraint_degree` is the maximal constraint degree for the table. +/// For each `CrossTableLookup`, and each looking/looked table, the partial products for the CTL are computed, and added to the said table's `CtlZData`. +pub(crate) fn cross_table_lookup_data<'a, F: RichField, const D: usize, const N: usize>( + trace_poly_values: &[Vec>; N], + cross_table_lookups: &'a [CrossTableLookup], ctl_challenges: &GrandProductChallengeSet, -) -> [CtlData; NUM_TABLES] { - let mut ctl_data_per_table = [0; NUM_TABLES].map(|_| CtlData::default()); + constraint_degree: usize, +) -> [CtlData<'a, F>; N] { + let mut ctl_data_per_table = [0; N].map(|_| CtlData::default()); for CrossTableLookup { looking_tables, looked_table, @@ -462,132 +334,270 @@ pub(crate) fn cross_table_lookup_data( { log::debug!("Processing CTL for {:?}", looked_table.table); for &challenge in &ctl_challenges.challenges { - let zs_looking = looking_tables.iter().map(|table| { - partial_products( - &trace_poly_values[table.table as usize], - &table.columns, - &table.filter_column, - challenge, - ) - }); - let z_looked = partial_products( - &trace_poly_values[looked_table.table as usize], - &looked_table.columns, - &looked_table.filter_column, + let helper_zs_looking = ctl_helper_zs_cols( + trace_poly_values, + looking_tables.clone(), challenge, + constraint_degree, ); - for (table, z) in looking_tables.iter().zip(zs_looking) { - ctl_data_per_table[table.table as usize] - .zs_columns - .push(CtlZData { - z, - challenge, - columns: table.columns.clone(), - filter_column: table.filter_column.clone(), - }); + + let z_looked = partial_sums( + &trace_poly_values[looked_table.table], + &[(&looked_table.columns, &looked_table.filter)], + challenge, + constraint_degree, + ); + + for (table, helpers_zs) in helper_zs_looking { + let num_helpers = helpers_zs.len() - 1; + let count = looking_tables + .iter() + .filter(|looking_table| 
looking_table.table == table) + .count(); + let cols_filts = looking_tables.iter().filter_map(|looking_table| { + if looking_table.table == table { + Some((&looking_table.columns, &looking_table.filter)) + } else { + None + } + }); + let mut columns = Vec::with_capacity(count); + let mut filter = Vec::with_capacity(count); + for (col, filt) in cols_filts { + columns.push(&col[..]); + filter.push(filt.clone()); + } + ctl_data_per_table[table].zs_columns.push(CtlZData { + helper_columns: helpers_zs[..num_helpers].to_vec(), + z: helpers_zs[num_helpers].clone(), + challenge, + columns, + filter, + }); } - ctl_data_per_table[looked_table.table as usize] + // There is no helper column for the looking table. + let looked_poly = z_looked[0].clone(); + ctl_data_per_table[looked_table.table] .zs_columns .push(CtlZData { - z: z_looked, + helper_columns: vec![], + z: looked_poly, challenge, - columns: looked_table.columns.clone(), - filter_column: looked_table.filter_column.clone(), + columns: vec![&looked_table.columns[..]], + filter: vec![looked_table.filter.clone()], }); } } ctl_data_per_table } -fn partial_products( +/// Computes helper columns and Z polynomials for all looking tables +/// of one cross-table lookup (i.e. for one looked table). +fn ctl_helper_zs_cols( + all_stark_traces: &[Vec>; N], + looking_tables: Vec>, + challenge: GrandProductChallenge, + constraint_degree: usize, +) -> Vec<(usize, Vec>)> { + let grouped_lookups = looking_tables.iter().group_by(|a| a.table); + + grouped_lookups + .into_iter() + .map(|(table, group)| { + let columns_filters = group + .map(|table| (&table.columns[..], &table.filter)) + .collect::], &Option>)>>(); + ( + table, + partial_sums( + &all_stark_traces[table], + &columns_filters, + challenge, + constraint_degree, + ), + ) + }) + .collect::>)>>() +} + +/// Computes the cross-table lookup partial sums for one table and given column linear combinations. +/// `trace` represents the trace values for the given table. 
+/// `columns` is a vector of column linear combinations to evaluate. Each element in the vector represents columns that need to be combined. +/// `filter_cols` are column linear combinations used to determine whether a row should be selected. +/// `challenge` is a cross-table lookup challenge. +/// The initial sum `s` is 0. +/// For each row, if the `filter_column` evaluates to 1, then the row is selected. All the column linear combinations are evaluated at said row. +/// The evaluations of each elements of `columns` are then combined together to form a value `v`. +/// The values `v`` are grouped together, in groups of size `constraint_degree - 1` (2 in our case). For each group, we construct a helper +/// column: h = \sum_i 1/(v_i). +/// +/// The sum is updated: `s += \sum h_i`, and is pushed to the vector of partial sums `z``. +/// Returns the helper columns and `z`. +fn partial_sums( trace: &[PolynomialValues], - columns: &[Column], - filter_column: &Option>, + columns_filters: &[ColumnFilter], challenge: GrandProductChallenge, -) -> PolynomialValues { - let mut partial_prod = F::ONE; + constraint_degree: usize, +) -> Vec> { let degree = trace[0].len(); - let mut res = Vec::with_capacity(degree); - for i in (0..degree).rev() { - let filter = if let Some(column) = filter_column { - column.eval_table(trace, i) - } else { - F::ONE - }; - if filter.is_one() { - let evals = columns - .iter() - .map(|c| c.eval_table(trace, i)) - .collect::>(); - partial_prod *= challenge.combine(evals.iter()); - } else { - assert_eq!(filter, F::ZERO, "Non-binary filter?") - }; - res.push(partial_prod); + let mut z = Vec::with_capacity(degree); + + let mut helper_columns = + get_helper_cols(trace, degree, columns_filters, challenge, constraint_degree); + + let x = helper_columns + .iter() + .map(|col| col.values[degree - 1]) + .sum::(); + z.push(x); + + for i in (0..degree - 1).rev() { + let x = helper_columns.iter().map(|col| col.values[i]).sum::(); + + z.push(z[z.len() - 1] + x); + 
} + z.reverse(); + if columns_filters.len() > 2 { + helper_columns.push(z.into()); + } else { + helper_columns = vec![z.into()]; } - res.reverse(); - res.into() + + helper_columns } +/// Data necessary to check the cross-table lookups of a given table. #[derive(Clone)] -pub struct CtlCheckVars<'a, F, FE, P, const D2: usize> +pub(crate) struct CtlCheckVars<'a, F, FE, P, const D2: usize> where F: Field, FE: FieldExtension, P: PackedField, { + /// Helper columns to check that the Z polyomial + /// was constructed correctly. + pub(crate) helper_columns: Vec

, + /// Evaluation of the trace polynomials at point `zeta`. pub(crate) local_z: P, + /// Evaluation of the trace polynomials at point `g * zeta` pub(crate) next_z: P, + /// Cross-table lookup challenges. pub(crate) challenges: GrandProductChallenge, - pub(crate) columns: &'a [Column], - pub(crate) filter_column: &'a Option>, + /// Column linear combinations of the `CrossTableLookup`s. + pub(crate) columns: Vec<&'a [Column]>, + /// Filter that evaluates to either 1 or 0. + pub(crate) filter: Vec>>, } impl<'a, F: RichField + Extendable, const D: usize> CtlCheckVars<'a, F, F::Extension, F::Extension, D> { - pub(crate) fn from_proofs>( - proofs: &[StarkProofWithMetadata; NUM_TABLES], + /// Extracts the `CtlCheckVars` for each STARK. + pub(crate) fn from_proofs, const N: usize>( + proofs: &[StarkProofWithMetadata; N], cross_table_lookups: &'a [CrossTableLookup], ctl_challenges: &'a GrandProductChallengeSet, - num_lookup_columns: &[usize; NUM_TABLES], - ) -> [Vec; NUM_TABLES] { - let mut ctl_zs = proofs + num_lookup_columns: &[usize; N], + num_helper_ctl_columns: &Vec<[usize; N]>, + ) -> [Vec; N] { + let mut total_num_helper_cols_by_table = [0; N]; + for p_ctls in num_helper_ctl_columns { + for j in 0..N { + total_num_helper_cols_by_table[j] += p_ctls[j] * ctl_challenges.challenges.len(); + } + } + + // Get all cross-table lookup polynomial openings for each STARK proof. 
+ let ctl_zs = proofs .iter() .zip(num_lookup_columns) .map(|(p, &num_lookup)| { let openings = &p.proof.openings; - let ctl_zs = openings.auxiliary_polys.iter().skip(num_lookup); - let ctl_zs_next = openings.auxiliary_polys_next.iter().skip(num_lookup); - ctl_zs.zip(ctl_zs_next) + + let ctl_zs = &openings.auxiliary_polys[num_lookup..]; + let ctl_zs_next = &openings.auxiliary_polys_next[num_lookup..]; + ctl_zs.iter().zip(ctl_zs_next).collect::>() }) .collect::>(); - let mut ctl_vars_per_table = [0; NUM_TABLES].map(|_| vec![]); - for CrossTableLookup { - looking_tables, - looked_table, - } in cross_table_lookups + // Put each cross-table lookup polynomial into the correct table data: if a CTL polynomial is extracted from looking/looked table t, then we add it to the `CtlCheckVars` of table t. + let mut start_indices = [0; N]; + let mut z_indices = [0; N]; + let mut ctl_vars_per_table = [0; N].map(|_| vec![]); + for ( + CrossTableLookup { + looking_tables, + looked_table, + }, + num_ctls, + ) in cross_table_lookups.iter().zip(num_helper_ctl_columns) { for &challenges in &ctl_challenges.challenges { + // Group looking tables by `Table`, since we bundle the looking tables taken from the same `Table` together thanks to helper columns. + // We want to only iterate on each `Table` once. + let mut filtered_looking_tables = Vec::with_capacity(min(looking_tables.len(), N)); for table in looking_tables { - let (looking_z, looking_z_next) = ctl_zs[table.table as usize].next().unwrap(); - ctl_vars_per_table[table.table as usize].push(Self { + if !filtered_looking_tables.contains(&(table.table)) { + filtered_looking_tables.push(table.table); + } + } + + for &table in filtered_looking_tables.iter() { + // We have first all the helper polynomials, then all the z polynomials. 
+ let (looking_z, looking_z_next) = + ctl_zs[table][total_num_helper_cols_by_table[table] + z_indices[table]]; + + let count = looking_tables + .iter() + .filter(|looking_table| looking_table.table == table) + .count(); + let cols_filts = looking_tables.iter().filter_map(|looking_table| { + if looking_table.table == table { + Some((&looking_table.columns, &looking_table.filter)) + } else { + None + } + }); + let mut columns = Vec::with_capacity(count); + let mut filter = Vec::with_capacity(count); + for (col, filt) in cols_filts { + columns.push(&col[..]); + filter.push(filt.clone()); + } + let helper_columns = ctl_zs[table] + [start_indices[table]..start_indices[table] + num_ctls[table]] + .iter() + .map(|&(h, _)| *h) + .collect::>(); + + start_indices[table] += num_ctls[table]; + + z_indices[table] += 1; + ctl_vars_per_table[table].push(Self { + helper_columns, local_z: *looking_z, next_z: *looking_z_next, challenges, - columns: &table.columns, - filter_column: &table.filter_column, + columns, + filter, }); } - let (looked_z, looked_z_next) = ctl_zs[looked_table.table as usize].next().unwrap(); - ctl_vars_per_table[looked_table.table as usize].push(Self { + let (looked_z, looked_z_next) = ctl_zs[looked_table.table] + [total_num_helper_cols_by_table[looked_table.table] + + z_indices[looked_table.table]]; + + z_indices[looked_table.table] += 1; + + let columns = vec![&looked_table.columns[..]]; + let filter = vec![looked_table.filter.clone()]; + ctl_vars_per_table[looked_table.table].push(Self { + helper_columns: vec![], local_z: *looked_z, next_z: *looked_z_next, challenges, - columns: &looked_table.columns, - filter_column: &looked_table.filter_column, + columns, + filter, }); } } @@ -595,14 +605,18 @@ impl<'a, F: RichField + Extendable, const D: usize> } } -/// CTL Z partial products are upside down: the complete product is on the first row, and +/// Checks the cross-table lookup Z polynomials for each table: +/// - Checks that the CTL `Z` partial sums are 
correctly updated. +/// - Checks that the final value of the CTL sum is the combination of all STARKs' CTL polynomials. +/// CTL `Z` partial sums are upside down: the complete sum is on the first row, and /// the first term is on the last row. This allows the transition constraint to be: -/// Z(w) = Z(gw) * combine(w) where combine is called on the local row +/// `combine(w) * (Z(w) - Z(gw)) = filter` where combine is called on the local row /// and not the next. This enables CTLs across two rows. pub(crate) fn eval_cross_table_lookup_checks( vars: &S::EvaluationFrame, ctl_vars: &[CtlCheckVars], consumer: &mut ConstraintConsumer

, + constraint_degree: usize, ) where F: RichField + Extendable, FE: FieldExtension, @@ -614,96 +628,198 @@ pub(crate) fn eval_cross_table_lookup_checks>() + }) .collect::>(); - let combined = challenges.combine(evals.iter()); - let local_filter = if let Some(column) = filter_column { - column.eval_with_next(local_values, next_values) - } else { - P::ONES - }; - let select = local_filter * combined + P::ONES - local_filter; - // Check value of `Z(g^(n-1))` - consumer.constraint_last_row(*local_z - select); - // Check `Z(w) = combination * Z(gw)` - consumer.constraint_transition(*next_z * select - *local_z); + // Check helper columns. + eval_helper_columns( + filter, + &evals, + local_values, + next_values, + helper_columns, + constraint_degree, + challenges, + consumer, + ); + + if !helper_columns.is_empty() { + let h_sum = helper_columns.iter().fold(P::ZEROS, |acc, x| acc + *x); + // Check value of `Z(g^(n-1))` + consumer.constraint_last_row(*local_z - h_sum); + // Check `Z(w) = Z(gw) + \sum h_i` + consumer.constraint_transition(*local_z - *next_z - h_sum); + } else if columns.len() > 1 { + let combin0 = challenges.combine(&evals[0]); + let combin1 = challenges.combine(&evals[1]); + + let f0 = if let Some(filter0) = &filter[0] { + filter0.eval_filter(local_values, next_values) + } else { + P::ONES + }; + let f1 = if let Some(filter1) = &filter[1] { + filter1.eval_filter(local_values, next_values) + } else { + P::ONES + }; + + consumer + .constraint_last_row(combin0 * combin1 * *local_z - f0 * combin1 - f1 * combin0); + consumer.constraint_transition( + combin0 * combin1 * (*local_z - *next_z) - f0 * combin1 - f1 * combin0, + ); + } else { + let combin0 = challenges.combine(&evals[0]); + let f0 = if let Some(filter0) = &filter[0] { + filter0.eval_filter(local_values, next_values) + } else { + P::ONES + }; + consumer.constraint_last_row(combin0 * *local_z - f0); + consumer.constraint_transition(combin0 * (*local_z - *next_z) - f0); + } } } +/// Circuit version of 
`CtlCheckVars`. Data necessary to check the cross-table lookups of a given table. #[derive(Clone)] -pub struct CtlCheckVarsTarget<'a, F: Field, const D: usize> { +pub(crate) struct CtlCheckVarsTarget { + ///Evaluation of the helper columns to check that the Z polyomial + /// was constructed correctly. + pub(crate) helper_columns: Vec>, + /// Evaluation of the trace polynomials at point `zeta`. pub(crate) local_z: ExtensionTarget, + /// Evaluation of the trace polynomials at point `g * zeta`. pub(crate) next_z: ExtensionTarget, + /// Cross-table lookup challenges. pub(crate) challenges: GrandProductChallenge, - pub(crate) columns: &'a [Column], - pub(crate) filter_column: &'a Option>, + /// Column linear combinations of the `CrossTableLookup`s. + pub(crate) columns: Vec>>, + /// Filter that evaluates to either 1 or 0. + pub(crate) filter: Vec>>, } -impl<'a, F: Field, const D: usize> CtlCheckVarsTarget<'a, F, D> { +impl<'a, F: Field, const D: usize> CtlCheckVarsTarget { + /// Circuit version of `from_proofs`. Extracts the `CtlCheckVarsTarget` for each STARK. pub(crate) fn from_proof( - table: Table, + table: TableIdx, proof: &StarkProofTarget, cross_table_lookups: &'a [CrossTableLookup], ctl_challenges: &'a GrandProductChallengeSet, num_lookup_columns: usize, + total_num_helper_columns: usize, + num_helper_ctl_columns: &[usize], ) -> Vec { - let mut ctl_zs = { + // Get all cross-table lookup polynomial openings for each STARK proof. + let ctl_zs = { let openings = &proof.openings; let ctl_zs = openings.auxiliary_polys.iter().skip(num_lookup_columns); let ctl_zs_next = openings .auxiliary_polys_next .iter() .skip(num_lookup_columns); - ctl_zs.zip(ctl_zs_next) + ctl_zs.zip(ctl_zs_next).collect::>() }; + // Put each cross-table lookup polynomial into the correct table data: if a CTL polynomial is extracted from looking/looked table t, then we add it to the `CtlCheckVars` of table t. 
+ let mut z_index = 0; + let mut start_index = 0; let mut ctl_vars = vec![]; - for CrossTableLookup { - looking_tables, - looked_table, - } in cross_table_lookups + for ( + i, + CrossTableLookup { + looking_tables, + looked_table, + }, + ) in cross_table_lookups.iter().enumerate() { for &challenges in &ctl_challenges.challenges { - for looking_table in looking_tables { + // Group looking tables by `Table`, since we bundle the looking tables taken from the same `Table` together thanks to helper columns. + + let count = looking_tables + .iter() + .filter(|looking_table| looking_table.table == table) + .count(); + let cols_filts = looking_tables.iter().filter_map(|looking_table| { if looking_table.table == table { - let (looking_z, looking_z_next) = ctl_zs.next().unwrap(); - ctl_vars.push(Self { - local_z: *looking_z, - next_z: *looking_z_next, - challenges, - columns: &looking_table.columns, - filter_column: &looking_table.filter_column, - }); + Some((&looking_table.columns, &looking_table.filter)) + } else { + None + } + }); + if count > 0 { + let mut columns = Vec::with_capacity(count); + let mut filter = Vec::with_capacity(count); + for (col, filt) in cols_filts { + columns.push(col.clone()); + filter.push(filt.clone()); } + let (looking_z, looking_z_next) = ctl_zs[total_num_helper_columns + z_index]; + let helper_columns = ctl_zs + [start_index..start_index + num_helper_ctl_columns[i]] + .iter() + .map(|(&h, _)| h) + .collect::>(); + + start_index += num_helper_ctl_columns[i]; + z_index += 1; + // let columns = group.0.clone(); + // let filter = group.1.clone(); + ctl_vars.push(Self { + helper_columns, + local_z: *looking_z, + next_z: *looking_z_next, + challenges, + columns, + filter, + }); } if looked_table.table == table { - let (looked_z, looked_z_next) = ctl_zs.next().unwrap(); + let (looked_z, looked_z_next) = ctl_zs[total_num_helper_columns + z_index]; + z_index += 1; + + let columns = vec![looked_table.columns.clone()]; + let filter = 
vec![looked_table.filter.clone()]; ctl_vars.push(Self { + helper_columns: vec![], local_z: *looked_z, next_z: *looked_z_next, challenges, - columns: &looked_table.columns, - filter_column: &looked_table.filter_column, + columns, + filter, }); } } } - assert!(ctl_zs.next().is_none()); + ctl_vars } } +/// Circuit version of `eval_cross_table_lookup_checks`. Checks the cross-table lookup Z polynomials for each table: +/// - Checks that the CTL `Z` partial sums are correctly updated. +/// - Checks that the final value of the CTL sum is the combination of all STARKs' CTL polynomials. +/// CTL `Z` partial sums are upside down: the complete sum is on the first row, and +/// the first term is on the last row. This allows the transition constraint to be: +/// `combine(w) * (Z(w) - Z(gw)) = filter` where combine is called on the local row +/// and not the next. This enables CTLs across two rows. pub(crate) fn eval_cross_table_lookup_checks_circuit< S: Stark, F: RichField + Extendable, @@ -713,56 +829,106 @@ pub(crate) fn eval_cross_table_lookup_checks_circuit< vars: &S::EvaluationFrameTarget, ctl_vars: &[CtlCheckVarsTarget], consumer: &mut RecursiveConstraintConsumer, + constraint_degree: usize, ) { let local_values = vars.get_local_values(); let next_values = vars.get_next_values(); + let one = builder.one_extension(); + for lookup_vars in ctl_vars { let CtlCheckVarsTarget { + helper_columns, local_z, next_z, challenges, columns, - filter_column, + filter, } = lookup_vars; - let one = builder.one_extension(); - let local_filter = if let Some(column) = filter_column { - column.eval_circuit(builder, local_values) - } else { - one - }; - fn select, const D: usize>( - builder: &mut CircuitBuilder, - filter: ExtensionTarget, - x: ExtensionTarget, - ) -> ExtensionTarget { - let one = builder.one_extension(); - let tmp = builder.sub_extension(one, filter); - builder.mul_add_extension(filter, x, tmp) // filter * x + 1 - filter - } - + // Compute all linear combinations on the 
current table, and combine them using the challenge. let evals = columns .iter() - .map(|c| c.eval_with_next_circuit(builder, local_values, next_values)) + .map(|col| { + col.iter() + .map(|c| c.eval_with_next_circuit(builder, local_values, next_values)) + .collect::>() + }) .collect::>(); - let combined = challenges.combine_circuit(builder, &evals); - let select = select(builder, local_filter, combined); + // Check helper columns. + eval_helper_columns_circuit( + builder, + filter, + &evals, + local_values, + next_values, + helper_columns, + constraint_degree, + challenges, + consumer, + ); + + let z_diff = builder.sub_extension(*local_z, *next_z); + if !helper_columns.is_empty() { + // Check value of `Z(g^(n-1))` + let h_sum = builder.add_many_extension(helper_columns); - // Check value of `Z(g^(n-1))` - let last_row = builder.sub_extension(*local_z, select); - consumer.constraint_last_row(builder, last_row); - // Check `Z(w) = combination * Z(gw)` - let transition = builder.mul_sub_extension(*next_z, select, *local_z); - consumer.constraint_transition(builder, transition); + let last_row = builder.sub_extension(*local_z, h_sum); + consumer.constraint_last_row(builder, last_row); + // Check `Z(w) = Z(gw) * (filter / combination)` + + let transition = builder.sub_extension(z_diff, h_sum); + consumer.constraint_transition(builder, transition); + } else if columns.len() > 1 { + let combin0 = challenges.combine_circuit(builder, &evals[0]); + let combin1 = challenges.combine_circuit(builder, &evals[1]); + + let f0 = if let Some(filter0) = &filter[0] { + filter0.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + let f1 = if let Some(filter1) = &filter[1] { + filter1.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + + let combined = builder.mul_sub_extension(combin1, *local_z, f1); + let combined = builder.mul_extension(combined, combin0); + let constr = builder.arithmetic_extension(F::NEG_ONE, F::ONE, f0, 
combin1, combined); + consumer.constraint_last_row(builder, constr); + + let combined = builder.mul_sub_extension(combin1, z_diff, f1); + let combined = builder.mul_extension(combined, combin0); + let constr = builder.arithmetic_extension(F::NEG_ONE, F::ONE, f0, combin1, combined); + consumer.constraint_last_row(builder, constr); + } else { + let combin0 = challenges.combine_circuit(builder, &evals[0]); + let f0 = if let Some(filter0) = &filter[0] { + filter0.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + + let constr = builder.mul_sub_extension(combin0, *local_z, f0); + consumer.constraint_last_row(builder, constr); + let constr = builder.mul_sub_extension(combin0, z_diff, f0); + consumer.constraint_transition(builder, constr); + } } } -pub(crate) fn verify_cross_table_lookups, const D: usize>( +/// Verifies all cross-table lookups. +pub(crate) fn verify_cross_table_lookups< + F: RichField + Extendable, + const D: usize, + const N: usize, +>( cross_table_lookups: &[CrossTableLookup], - ctl_zs_first: [Vec; NUM_TABLES], - ctl_extra_looking_products: Vec>, + ctl_zs_first: [Vec; N], + ctl_extra_looking_sums: Vec>, config: &StarkConfig, ) -> Result<()> { let mut ctl_zs_openings = ctl_zs_first.iter().map(|v| v.iter()).collect::>(); @@ -774,17 +940,29 @@ pub(crate) fn verify_cross_table_lookups, const D: }, ) in cross_table_lookups.iter().enumerate() { - let extra_product_vec = &ctl_extra_looking_products[looked_table.table as usize]; + // Get elements looking into `looked_table` that are not associated to any STARK. + let extra_sum_vec = &ctl_extra_looking_sums[looked_table.table]; + // We want to iterate on each looking table only once. 
+ let mut filtered_looking_tables = vec![]; + for table in looking_tables { + if !filtered_looking_tables.contains(&(table.table)) { + filtered_looking_tables.push(table.table); + } + } for c in 0..config.num_challenges { - let looking_zs_prod = looking_tables + // Compute the combination of all looking table CTL polynomial openings. + + let looking_zs_sum = filtered_looking_tables .iter() - .map(|table| *ctl_zs_openings[table.table as usize].next().unwrap()) - .product::() - * extra_product_vec[c]; + .map(|&table| *ctl_zs_openings[table].next().unwrap()) + .sum::() + + extra_sum_vec[c]; - let looked_z = *ctl_zs_openings[looked_table.table as usize].next().unwrap(); + // Get the looked table CTL polynomial opening. + let looked_z = *ctl_zs_openings[looked_table.table].next().unwrap(); + // Ensure that the combination of looking table openings is equal to the looked table opening. ensure!( - looking_zs_prod == looked_z, + looking_zs_sum == looked_z, "Cross-table lookup {:?} verification failed.", index ); @@ -795,11 +973,16 @@ pub(crate) fn verify_cross_table_lookups, const D: Ok(()) } -pub(crate) fn verify_cross_table_lookups_circuit, const D: usize>( +/// Circuit version of `verify_cross_table_lookups`. Verifies all cross-table lookups. +pub(crate) fn verify_cross_table_lookups_circuit< + F: RichField + Extendable, + const D: usize, + const N: usize, +>( builder: &mut CircuitBuilder, cross_table_lookups: Vec>, - ctl_zs_first: [Vec; NUM_TABLES], - ctl_extra_looking_products: Vec>, + ctl_zs_first: [Vec; N], + ctl_extra_looking_sums: Vec>, inner_config: &StarkConfig, ) { let mut ctl_zs_openings = ctl_zs_first.iter().map(|v| v.iter()).collect::>(); @@ -808,18 +991,29 @@ pub(crate) fn verify_cross_table_lookups_circuit, c looked_table, } in cross_table_lookups.into_iter() { - let extra_product_vec = &ctl_extra_looking_products[looked_table.table as usize]; + // Get elements looking into `looked_table` that are not associated to any STARK. 
+ let extra_sum_vec = &ctl_extra_looking_sums[looked_table.table]; + // We want to iterate on each looking table only once. + let mut filtered_looking_tables = vec![]; + for table in looking_tables { + if !filtered_looking_tables.contains(&(table.table)) { + filtered_looking_tables.push(table.table); + } + } for c in 0..inner_config.num_challenges { - let mut looking_zs_prod = builder.mul_many( - looking_tables + // Compute the combination of all looking table CTL polynomial openings. + let mut looking_zs_sum = builder.add_many( + filtered_looking_tables .iter() - .map(|table| *ctl_zs_openings[table.table as usize].next().unwrap()), + .map(|&table| *ctl_zs_openings[table].next().unwrap()), ); - looking_zs_prod = builder.mul(looking_zs_prod, extra_product_vec[c]); + looking_zs_sum = builder.add(looking_zs_sum, extra_sum_vec[c]); - let looked_z = *ctl_zs_openings[looked_table.table as usize].next().unwrap(); - builder.connect(looked_z, looking_zs_prod); + // Get the looked table CTL polynomial opening. + let looked_z = *ctl_zs_openings[looked_table.table].next().unwrap(); + // Verify that the combination of looking table openings is equal to the looked table opening. 
+ builder.connect(looked_z, looking_zs_sum); } } debug_assert!(ctl_zs_openings.iter_mut().all(|iter| iter.next().is_none())); @@ -899,10 +1093,10 @@ pub(crate) mod testutils { table: &TableWithColumns, multiset: &mut MultiSet, ) { - let trace = &trace_poly_values[table.table as usize]; + let trace = &trace_poly_values[table.table]; for i in 0..trace[0].len() { - let filter = if let Some(column) = &table.filter_column { - column.eval_table(trace, i) + let filter = if let Some(combin) = &table.filter { + combin.eval_table(trace, i) } else { F::ONE }; @@ -912,7 +1106,10 @@ pub(crate) mod testutils { .iter() .map(|c| c.eval_table(trace, i)) .collect::>(); - multiset.entry(row).or_default().push((table.table, i)); + multiset + .entry(row) + .or_default() + .push((Table::all()[table.table], i)); } else { assert_eq!(filter, F::ZERO, "Non-binary filter?") } diff --git a/evm/src/curve_pairings.rs b/evm/src/curve_pairings.rs index d789051a2f..af155cc506 100644 --- a/evm/src/curve_pairings.rs +++ b/evm/src/curve_pairings.rs @@ -1,4 +1,4 @@ -use std::ops::{Add, Mul, Neg}; +use core::ops::{Add, Mul, Neg}; use ethereum_types::U256; use rand::distributions::Standard; @@ -8,7 +8,7 @@ use rand::Rng; use crate::extension_tower::{FieldExt, Fp12, Fp2, Fp6, Stack, BN254}; #[derive(Debug, Copy, Clone, PartialEq)] -pub struct Curve +pub(crate) struct Curve where T: FieldExt, { @@ -17,7 +17,7 @@ where } impl Curve { - pub fn unit() -> Self { + pub(crate) const fn unit() -> Self { Curve { x: T::ZERO, y: T::ZERO, @@ -47,7 +47,7 @@ where T: FieldExt, Curve: CyclicGroup, { - pub fn int(z: i32) -> Self { + pub(crate) fn int(z: i32) -> Self { Curve::::GENERATOR * z } } @@ -63,7 +63,7 @@ where } /// Standard addition formula for elliptic curves, restricted to the cases -/// https://en.wikipedia.org/wiki/Elliptic_curve#Algebraic_interpretation +/// impl Add for Curve { type Output = Self; @@ -195,15 +195,15 @@ impl CyclicGroup for Curve> { } // The tate pairing takes a point each from the curve 
and its twist and outputs an Fp12 element -pub fn bn_tate(p: Curve, q: Curve>) -> Fp12 { +pub(crate) fn bn_tate(p: Curve, q: Curve>) -> Fp12 { let miller_output = bn_miller_loop(p, q); bn_final_exponent(miller_output) } /// Standard code for miller loop, can be found on page 99 at this url: -/// https://static1.squarespace.com/static/5fdbb09f31d71c1227082339/t/5ff394720493bd28278889c6/1609798774687/PairingsForBeginners.pdf#page=107 +/// /// where BN_EXP is a hardcoding of the array of Booleans that the loop traverses -pub fn bn_miller_loop(p: Curve, q: Curve>) -> Fp12 { +pub(crate) fn bn_miller_loop(p: Curve, q: Curve>) -> Fp12 { let mut r = p; let mut acc: Fp12 = Fp12::::UNIT; let mut line: Fp12; @@ -222,14 +222,14 @@ pub fn bn_miller_loop(p: Curve, q: Curve>) -> Fp12 { } /// The sloped line function for doubling a point -pub fn bn_tangent(p: Curve, q: Curve>) -> Fp12 { +pub(crate) fn bn_tangent(p: Curve, q: Curve>) -> Fp12 { let cx = -BN254::new(3) * p.x * p.x; let cy = BN254::new(2) * p.y; bn_sparse_embed(p.y * p.y - BN254::new(9), q.x * cx, q.y * cy) } /// The sloped line function for adding two points -pub fn bn_cord(p1: Curve, p2: Curve, q: Curve>) -> Fp12 { +pub(crate) fn bn_cord(p1: Curve, p2: Curve, q: Curve>) -> Fp12 { let cx = p2.y - p1.y; let cy = p1.x - p2.x; bn_sparse_embed(p1.y * p2.x - p2.y * p1.x, q.x * cx, q.y * cy) @@ -237,7 +237,7 @@ pub fn bn_cord(p1: Curve, p2: Curve, q: Curve>) -> Fp12 /// The tangent and cord functions output sparse Fp12 elements. /// This map embeds the nonzero coefficients into an Fp12. 
-pub fn bn_sparse_embed(g000: BN254, g01: Fp2, g11: Fp2) -> Fp12 { +pub(crate) const fn bn_sparse_embed(g000: BN254, g01: Fp2, g11: Fp2) -> Fp12 { let g0 = Fp6 { t0: Fp2 { re: g000, @@ -256,7 +256,7 @@ pub fn bn_sparse_embed(g000: BN254, g01: Fp2, g11: Fp2) -> Fp12(rng: &mut R) -> Fp12 { +pub(crate) fn gen_bn_fp12_sparse(rng: &mut R) -> Fp12 { bn_sparse_embed( rng.gen::(), rng.gen::>(), @@ -276,7 +276,7 @@ pub fn gen_bn_fp12_sparse(rng: &mut R) -> Fp12 { /// (p^4 - p^2 + 1)/N = p^3 + (a2)p^2 - (a1)p - a0 /// where 0 < a0, a1, a2 < p. Then the final power is given by /// y = y_3 * (y^a2)_2 * (y^-a1)_1 * (y^-a0) -pub fn bn_final_exponent(f: Fp12) -> Fp12 { +pub(crate) fn bn_final_exponent(f: Fp12) -> Fp12 { let mut y = f.frob(6) / f; y = y.frob(2) * y; let (y_a2, y_a1, y_a0) = get_bn_custom_powers(y); @@ -370,7 +370,7 @@ const BN_EXP: [bool; 253] = [ false, ]; -// The folowing constants are defined above get_custom_powers +// The following constants are defined above get_custom_powers const BN_EXPS4: [(bool, bool, bool); 64] = [ (true, true, false), diff --git a/evm/src/extension_tower.rs b/evm/src/extension_tower.rs index 845d99aa63..ea4e317641 100644 --- a/evm/src/extension_tower.rs +++ b/evm/src/extension_tower.rs @@ -1,5 +1,5 @@ -use std::fmt::Debug; -use std::ops::{Add, Div, Mul, Neg, Sub}; +use core::fmt::Debug; +use core::ops::{Add, Div, Mul, Neg, Sub}; use ethereum_types::{U256, U512}; use rand::distributions::{Distribution, Standard}; @@ -21,7 +21,7 @@ pub trait FieldExt: fn inv(self) -> Self; } -pub const BN_BASE: U256 = U256([ +pub(crate) const BN_BASE: U256 = U256([ 0x3c208c16d87cfd47, 0x97816a916871ca8d, 0xb85045b68181585d, @@ -29,7 +29,7 @@ pub const BN_BASE: U256 = U256([ ]); #[derive(Debug, Copy, Clone, PartialEq)] -pub struct BN254 { +pub(crate) struct BN254 { pub val: U256, } @@ -114,7 +114,7 @@ impl Div for BN254 { } } -pub const BLS_BASE: U512 = U512([ +pub(crate) const BLS_BASE: U512 = U512([ 0xb9feffffffffaaab, 0x1eabfffeb153ffff, 
0x6730d2a0f6b0f624, @@ -126,16 +126,16 @@ pub const BLS_BASE: U512 = U512([ ]); #[derive(Debug, Copy, Clone, PartialEq)] -pub struct BLS381 { +pub(crate) struct BLS381 { pub val: U512, } impl BLS381 { - pub fn lo(self) -> U256 { + pub(crate) fn lo(self) -> U256 { U256(self.val.0[..4].try_into().unwrap()) } - pub fn hi(self) -> U256 { + pub(crate) fn hi(self) -> U256 { U256(self.val.0[4..].try_into().unwrap()) } } @@ -260,7 +260,7 @@ impl Div for BLS381 { /// The degree 2 field extension Fp2 is given by adjoining i, the square root of -1, to BN254 /// The arithmetic in this extension is standard complex arithmetic #[derive(Debug, Copy, Clone, PartialEq)] -pub struct Fp2 +pub(crate) struct Fp2 where T: FieldExt, { @@ -812,7 +812,7 @@ impl Adj for Fp2 { /// The degree 3 field extension Fp6 over Fp2 is given by adjoining t, where t^3 = 1 + i /// Fp6 has basis 1, t, t^2 over Fp2 #[derive(Debug, Copy, Clone, PartialEq)] -pub struct Fp6 +pub(crate) struct Fp6 where T: FieldExt, Fp2: Adj, @@ -944,7 +944,7 @@ where /// while the values of /// t^(p^n) and t^(2p^n) /// are precomputed in the constant arrays FROB_T1 and FROB_T2 - pub fn frob(self, n: usize) -> Fp6 { + pub(crate) fn frob(self, n: usize) -> Fp6 { let n = n % 6; let frob_t1 = Fp2::::FROB_T[0][n]; let frob_t2 = Fp2::::FROB_T[1][n]; @@ -1031,7 +1031,7 @@ where /// The degree 2 field extension Fp12 over Fp6 is given by /// adjoining z, where z^2 = t. 
It thus has basis 1, z over Fp6 #[derive(Debug, Copy, Clone, PartialEq)] -pub struct Fp12 +pub(crate) struct Fp12 where T: FieldExt, Fp2: Adj, @@ -1068,7 +1068,7 @@ where /// (Prod_{i=1}^11 x_i) / phi /// The 6th Frob map is nontrivial but leaves Fp6 fixed and hence must be the conjugate: /// x_6 = (a + bz)_6 = a - bz = x.conj() - /// Letting prod_17 = x_1 * x_7, the remaining factors in the numerator can be expresed as: + /// Letting prod_17 = x_1 * x_7, the remaining factors in the numerator can be expressed as: /// [(prod_17) * (prod_17)_2] * (prod_17)_4 * [(prod_17) * (prod_17)_2]_1 /// By Galois theory, both the following are in Fp2 and are complex conjugates /// prod_odds, prod_evens @@ -1200,7 +1200,7 @@ where /// which sends a + bz: Fp12 to /// a^(p^n) + b^(p^n) * z^(p^n) /// where the values of z^(p^n) are precomputed in the constant array FROB_Z - pub fn frob(self, n: usize) -> Fp12 { + pub(crate) fn frob(self, n: usize) -> Fp12 { let n = n % 12; Fp12 { z0: self.z0.frob(n), diff --git a/evm/src/fixed_recursive_verifier.rs b/evm/src/fixed_recursive_verifier.rs index 42919a97eb..2df85b03de 100644 --- a/evm/src/fixed_recursive_verifier.rs +++ b/evm/src/fixed_recursive_verifier.rs @@ -1,7 +1,10 @@ use core::mem::{self, MaybeUninit}; +use core::ops::Range; use std::collections::BTreeMap; -use std::ops::Range; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; +use anyhow::anyhow; use eth_trie_utils::partial_trie::{HashedPartialTrie, Node, PartialTrie}; use hashbrown::HashMap; use itertools::{zip_eq, Itertools}; @@ -15,7 +18,7 @@ use plonky2::iop::target::{BoolTarget, Target}; use plonky2::iop::witness::{PartialWitness, WitnessWrite}; use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::circuit_data::{ - CircuitConfig, CircuitData, CommonCircuitData, VerifierCircuitTarget, + CircuitConfig, CircuitData, CommonCircuitData, VerifierCircuitData, VerifierCircuitTarget, }; use plonky2::plonk::config::{AlgebraicHasher, GenericConfig}; use 
plonky2::plonk::proof::{ProofWithPublicInputs, ProofWithPublicInputsTarget}; @@ -36,14 +39,14 @@ use crate::cross_table_lookup::{ use crate::generation::GenerationInputs; use crate::get_challenges::observe_public_values_target; use crate::proof::{ - BlockHashesTarget, BlockMetadataTarget, ExtraBlockDataTarget, PublicValues, PublicValuesTarget, - StarkProofWithMetadata, TrieRootsTarget, + AllProof, BlockHashesTarget, BlockMetadataTarget, ExtraBlockData, ExtraBlockDataTarget, + PublicValues, PublicValuesTarget, StarkProofWithMetadata, TrieRoots, TrieRootsTarget, }; -use crate::prover::prove; +use crate::prover::{check_abort_signal, prove}; use crate::recursive_verifier::{ - add_common_recursion_gates, add_virtual_public_values, - get_memory_extra_looking_products_circuit, recursive_stark_circuit, set_public_value_targets, - PlonkWrapperCircuit, PublicInputs, StarkWrapperCircuit, + add_common_recursion_gates, add_virtual_public_values, get_memory_extra_looking_sum_circuit, + recursive_stark_circuit, set_public_value_targets, PlonkWrapperCircuit, PublicInputs, + StarkWrapperCircuit, }; use crate::stark::Stark; use crate::util::h256_limbs; @@ -64,11 +67,13 @@ where { /// The EVM root circuit, which aggregates the (shrunk) per-table recursive proofs. pub root: RootCircuitData, + /// The aggregation circuit, which verifies two proofs that can either be root or + /// aggregation proofs. pub aggregation: AggregationCircuitData, - /// The block circuit, which verifies an aggregation root proof and a previous block proof. + /// The block circuit, which verifies an aggregation root proof and an optional previous block proof. pub block: BlockCircuitData, /// Holds chains of circuits for each table and for each initial `degree_bits`. 
- by_table: [RecursiveCircuitsForTable; NUM_TABLES], + pub by_table: [RecursiveCircuitsForTable; NUM_TABLES], } /// Data for the EVM root circuit, which is used to combine each STARK's shrunk wrapper proof @@ -96,7 +101,7 @@ where F: RichField + Extendable, C: GenericConfig, { - pub fn to_buffer( + fn to_buffer( &self, buffer: &mut Vec, gate_serializer: &dyn GateSerializer, @@ -114,7 +119,7 @@ where Ok(()) } - pub fn from_buffer( + fn from_buffer( buffer: &mut Buffer, gate_serializer: &dyn GateSerializer, generator_serializer: &dyn WitnessGeneratorSerializer, @@ -161,7 +166,7 @@ where F: RichField + Extendable, C: GenericConfig, { - pub fn to_buffer( + fn to_buffer( &self, buffer: &mut Vec, gate_serializer: &dyn GateSerializer, @@ -175,7 +180,7 @@ where Ok(()) } - pub fn from_buffer( + fn from_buffer( buffer: &mut Buffer, gate_serializer: &dyn GateSerializer, generator_serializer: &dyn WitnessGeneratorSerializer, @@ -196,21 +201,21 @@ where } #[derive(Eq, PartialEq, Debug)] -pub struct AggregationChildTarget { +struct AggregationChildTarget { is_agg: BoolTarget, agg_proof: ProofWithPublicInputsTarget, evm_proof: ProofWithPublicInputsTarget, } impl AggregationChildTarget { - pub fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { + fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { buffer.write_target_bool(self.is_agg)?; buffer.write_target_proof_with_public_inputs(&self.agg_proof)?; buffer.write_target_proof_with_public_inputs(&self.evm_proof)?; Ok(()) } - pub fn from_buffer(buffer: &mut Buffer) -> IoResult { + fn from_buffer(buffer: &mut Buffer) -> IoResult { let is_agg = buffer.read_target_bool()?; let agg_proof = buffer.read_target_proof_with_public_inputs()?; let evm_proof = buffer.read_target_proof_with_public_inputs()?; @@ -221,7 +226,7 @@ impl AggregationChildTarget { }) } - pub fn public_values>( + fn public_values>( &self, builder: &mut CircuitBuilder, ) -> PublicValuesTarget { @@ -231,6 +236,8 @@ impl AggregationChildTarget { } } +/// Data for 
the block circuit, which is used to generate a final block proof, +/// and compress it with an optional parent proof if present. #[derive(Eq, PartialEq, Debug)] pub struct BlockCircuitData where @@ -250,7 +257,7 @@ where F: RichField + Extendable, C: GenericConfig, { - pub fn to_buffer( + fn to_buffer( &self, buffer: &mut Vec, gate_serializer: &dyn GateSerializer, @@ -265,7 +272,7 @@ where Ok(()) } - pub fn from_buffer( + fn from_buffer( buffer: &mut Buffer, gate_serializer: &dyn GateSerializer, generator_serializer: &dyn WitnessGeneratorSerializer, @@ -293,8 +300,19 @@ where C: GenericConfig + 'static, C::Hasher: AlgebraicHasher, { + /// Serializes all these preprocessed circuits into a sequence of bytes. + /// + /// # Arguments + /// + /// - `skip_tables`: a boolean indicating whether to serialize only the upper circuits + /// or the entire prover state, including recursive circuits to shrink STARK proofs. + /// - `gate_serializer`: a custom gate serializer needed to serialize recursive circuits + /// common data. + /// - `generator_serializer`: a custom generator serializer needed to serialize recursive + /// circuits proving data. pub fn to_bytes( &self, + skip_tables: bool, gate_serializer: &dyn GateSerializer, generator_serializer: &dyn WitnessGeneratorSerializer, ) -> IoResult> { @@ -306,14 +324,28 @@ where .to_buffer(&mut buffer, gate_serializer, generator_serializer)?; self.block .to_buffer(&mut buffer, gate_serializer, generator_serializer)?; - for table in &self.by_table { - table.to_buffer(&mut buffer, gate_serializer, generator_serializer)?; + if !skip_tables { + for table in &self.by_table { + table.to_buffer(&mut buffer, gate_serializer, generator_serializer)?; + } } Ok(buffer) } + /// Deserializes a sequence of bytes into an entire prover state containing all recursive circuits. + /// + /// # Arguments + /// + /// - `bytes`: a slice of bytes to deserialize this prover state from. 
+ /// - `skip_tables`: a boolean indicating whether to deserialize only the upper circuits + /// or the entire prover state, including recursive circuits to shrink STARK proofs. + /// - `gate_serializer`: a custom gate serializer needed to serialize recursive circuits + /// common data. + /// - `generator_serializer`: a custom generator serializer needed to serialize recursive + /// circuits proving data. pub fn from_bytes( bytes: &[u8], + skip_tables: bool, gate_serializer: &dyn GateSerializer, generator_serializer: &dyn WitnessGeneratorSerializer, ) -> IoResult { @@ -328,21 +360,30 @@ where let block = BlockCircuitData::from_buffer(&mut buffer, gate_serializer, generator_serializer)?; - // Tricky use of MaybeUninit to remove the need for implementing Debug - // for all underlying types, necessary to convert a by_table Vec to an array. - let by_table = { - let mut by_table: [MaybeUninit>; NUM_TABLES] = - unsafe { MaybeUninit::uninit().assume_init() }; - for table in &mut by_table[..] { - let value = RecursiveCircuitsForTable::from_buffer( - &mut buffer, - gate_serializer, - generator_serializer, - )?; - *table = MaybeUninit::new(value); - } - unsafe { - mem::transmute::<_, [RecursiveCircuitsForTable; NUM_TABLES]>(by_table) + let by_table = match skip_tables { + true => (0..NUM_TABLES) + .map(|_| RecursiveCircuitsForTable { + by_stark_size: BTreeMap::default(), + }) + .collect_vec() + .try_into() + .unwrap(), + false => { + // Tricky use of MaybeUninit to remove the need for implementing Debug + // for all underlying types, necessary to convert a by_table Vec to an array. + let mut by_table: [MaybeUninit>; NUM_TABLES] = + unsafe { MaybeUninit::uninit().assume_init() }; + for table in &mut by_table[..] 
{ + let value = RecursiveCircuitsForTable::from_buffer( + &mut buffer, + gate_serializer, + generator_serializer, + )?; + *table = MaybeUninit::new(value); + } + unsafe { + mem::transmute::<_, [RecursiveCircuitsForTable; NUM_TABLES]>(by_table) + } } }; @@ -355,6 +396,19 @@ where } /// Preprocess all recursive circuits used by the system. + /// + /// # Arguments + /// + /// - `all_stark`: a structure defining the logic of all STARK modules and their associated + /// cross-table lookups. + /// - `degree_bits_ranges`: the logarithmic ranges to be supported for the recursive tables. + /// Transactions may yield arbitrary trace lengths for each STARK module (within some bounds), + /// unknown prior generating the witness to create a proof. Thus, for each STARK module, we + /// construct a map from `2^{degree_bits} = length` to a chain of shrinking recursion circuits, + /// starting from that length, for each `degree_bits` in the range specified for this STARK module. + /// Specifying a wide enough range allows a prover to cover all possible scenarios. + /// - `stark_config`: the configuration to be used for the STARK prover. It will usually be a fast + /// one yielding large proofs. 
pub fn new( all_stark: &AllStark, degree_bits_ranges: &[Range; NUM_TABLES], @@ -363,49 +417,49 @@ where let arithmetic = RecursiveCircuitsForTable::new( Table::Arithmetic, &all_stark.arithmetic_stark, - degree_bits_ranges[Table::Arithmetic as usize].clone(), + degree_bits_ranges[*Table::Arithmetic].clone(), &all_stark.cross_table_lookups, stark_config, ); let byte_packing = RecursiveCircuitsForTable::new( Table::BytePacking, &all_stark.byte_packing_stark, - degree_bits_ranges[Table::BytePacking as usize].clone(), + degree_bits_ranges[*Table::BytePacking].clone(), &all_stark.cross_table_lookups, stark_config, ); let cpu = RecursiveCircuitsForTable::new( Table::Cpu, &all_stark.cpu_stark, - degree_bits_ranges[Table::Cpu as usize].clone(), + degree_bits_ranges[*Table::Cpu].clone(), &all_stark.cross_table_lookups, stark_config, ); let keccak = RecursiveCircuitsForTable::new( Table::Keccak, &all_stark.keccak_stark, - degree_bits_ranges[Table::Keccak as usize].clone(), + degree_bits_ranges[*Table::Keccak].clone(), &all_stark.cross_table_lookups, stark_config, ); let keccak_sponge = RecursiveCircuitsForTable::new( Table::KeccakSponge, &all_stark.keccak_sponge_stark, - degree_bits_ranges[Table::KeccakSponge as usize].clone(), + degree_bits_ranges[*Table::KeccakSponge].clone(), &all_stark.cross_table_lookups, stark_config, ); let logic = RecursiveCircuitsForTable::new( Table::Logic, &all_stark.logic_stark, - degree_bits_ranges[Table::Logic as usize].clone(), + degree_bits_ranges[*Table::Logic].clone(), &all_stark.cross_table_lookups, stark_config, ); let memory = RecursiveCircuitsForTable::new( Table::Memory, &all_stark.memory_stark, - degree_bits_ranges[Table::Memory as usize].clone(), + degree_bits_ranges[*Table::Memory].clone(), &all_stark.cross_table_lookups, stark_config, ); @@ -430,6 +484,25 @@ where } } + /// Outputs the `VerifierCircuitData` needed to verify any block proof + /// generated by an honest prover. 
+ /// While the [`AllRecursiveCircuits`] prover state can also verify proofs, verifiers + /// only need a fraction of the state to verify proofs. This allows much less powerful + /// entities to behave as verifiers, by only loading the necessary data to verify block proofs. + /// + /// # Usage + /// + /// ```ignore + /// let prover_state = AllRecursiveCircuits { ... }; + /// let verifier_state = prover_state.final_verifier_data(); + /// + /// // Verify a provided block proof + /// assert!(verifier_state.verify(&block_proof).is_ok()); + /// ``` + pub fn final_verifier_data(&self) -> VerifierCircuitData { + self.block.circuit.verifier_data() + } + fn create_root_circuit( by_table: &[RecursiveCircuitsForTable; NUM_TABLES], stark_config: &StarkConfig, @@ -493,15 +566,15 @@ where } } - // Extra products to add to the looked last value. + // Extra sums to add to the looked last value. // Only necessary for the Memory values. - let mut extra_looking_products = - vec![vec![builder.one(); stark_config.num_challenges]; NUM_TABLES]; + let mut extra_looking_sums = + vec![vec![builder.zero(); stark_config.num_challenges]; NUM_TABLES]; // Memory - extra_looking_products[Table::Memory as usize] = (0..stark_config.num_challenges) + extra_looking_sums[*Table::Memory] = (0..stark_config.num_challenges) .map(|c| { - get_memory_extra_looking_products_circuit( + get_memory_extra_looking_sum_circuit( &mut builder, &public_values, ctl_challenges.challenges[c], @@ -510,11 +583,11 @@ where .collect_vec(); // Verify the CTL checks. - verify_cross_table_lookups_circuit::( + verify_cross_table_lookups_circuit::( &mut builder, all_cross_table_lookups(), pis.map(|p| p.ctl_zs_first), - extra_looking_products, + extra_looking_sums, stark_config, ); @@ -643,18 +716,18 @@ where lhs: &ExtraBlockDataTarget, rhs: &ExtraBlockDataTarget, ) { - // Connect genesis state root values. + // Connect checkpoint state root values. 
for (&limb0, &limb1) in pvs - .genesis_state_trie_root + .checkpoint_state_trie_root .iter() - .zip(&rhs.genesis_state_trie_root) + .zip(&rhs.checkpoint_state_trie_root) { builder.connect(limb0, limb1); } for (&limb0, &limb1) in pvs - .genesis_state_trie_root + .checkpoint_state_trie_root .iter() - .zip(&lhs.genesis_state_trie_root) + .zip(&lhs.checkpoint_state_trie_root) { builder.connect(limb0, limb1); } @@ -667,26 +740,11 @@ where builder.connect(lhs.txn_number_after, rhs.txn_number_before); // Connect the gas used in public values to the lhs and rhs values correctly. - builder.connect(pvs.gas_used_before[0], lhs.gas_used_before[0]); - builder.connect(pvs.gas_used_before[1], lhs.gas_used_before[1]); - builder.connect(pvs.gas_used_after[0], rhs.gas_used_after[0]); - builder.connect(pvs.gas_used_after[1], rhs.gas_used_after[1]); + builder.connect(pvs.gas_used_before, lhs.gas_used_before); + builder.connect(pvs.gas_used_after, rhs.gas_used_after); // Connect lhs `gas_used_after` with rhs `gas_used_before`. - builder.connect(lhs.gas_used_after[0], rhs.gas_used_before[0]); - builder.connect(lhs.gas_used_after[1], rhs.gas_used_before[1]); - - // Connect the `block_bloom` in public values to the lhs and rhs values correctly. - for (&limb0, &limb1) in pvs.block_bloom_after.iter().zip(&rhs.block_bloom_after) { - builder.connect(limb0, limb1); - } - for (&limb0, &limb1) in pvs.block_bloom_before.iter().zip(&lhs.block_bloom_before) { - builder.connect(limb0, limb1); - } - // Connect lhs `block_bloom_after` with rhs `block_bloom_before`. 
- for (&limb0, &limb1) in lhs.block_bloom_after.iter().zip(&rhs.block_bloom_before) { - builder.connect(limb0, limb1); - } + builder.connect(lhs.gas_used_after, rhs.gas_used_before); } fn add_agg_child( @@ -733,6 +791,34 @@ where let parent_pv = PublicValuesTarget::from_public_inputs(&parent_block_proof.public_inputs); let agg_pv = PublicValuesTarget::from_public_inputs(&agg_root_proof.public_inputs); + // Connect block `trie_roots_before` with parent_pv `trie_roots_before`. + TrieRootsTarget::connect( + &mut builder, + public_values.trie_roots_before, + parent_pv.trie_roots_before, + ); + // Connect the rest of block `public_values` with agg_pv. + TrieRootsTarget::connect( + &mut builder, + public_values.trie_roots_after, + agg_pv.trie_roots_after, + ); + BlockMetadataTarget::connect( + &mut builder, + public_values.block_metadata, + agg_pv.block_metadata, + ); + BlockHashesTarget::connect( + &mut builder, + public_values.block_hashes, + agg_pv.block_hashes, + ); + ExtraBlockDataTarget::connect( + &mut builder, + public_values.extra_block_data, + agg_pv.extra_block_data, + ); + // Make connections between block proofs, and check initial and final block values. Self::connect_block_proof(&mut builder, has_parent_block, &parent_pv, &agg_pv); @@ -760,7 +846,7 @@ where } /// Connect the 256 block hashes between two blocks - pub fn connect_block_hashes( + fn connect_block_hashes( builder: &mut CircuitBuilder, lhs: &ProofWithPublicInputsTarget, rhs: &ProofWithPublicInputsTarget, @@ -798,12 +884,12 @@ where builder.connect(limb0, limb1); } - // Between blocks, the genesis state trie remains unchanged. + // Between blocks, the checkpoint state trie remains unchanged. 
for (&limb0, limb1) in lhs .extra_block_data - .genesis_state_trie_root + .checkpoint_state_trie_root .iter() - .zip(rhs.extra_block_data.genesis_state_trie_root) + .zip(rhs.extra_block_data.checkpoint_state_trie_root) { builder.connect(limb0, limb1); } @@ -821,15 +907,11 @@ where let has_not_parent_block = builder.sub(one, has_parent_block.target); - // Check that the genesis block number is 0. - let gen_block_constr = builder.mul(has_not_parent_block, rhs.block_metadata.block_number); - builder.assert_zero(gen_block_constr); - - // Check that the genesis block has the predetermined state trie root in `ExtraBlockData`. - Self::connect_genesis_block(builder, rhs, has_not_parent_block); + // Check that the checkpoint block has the predetermined state trie root in `ExtraBlockData`. + Self::connect_checkpoint_block(builder, rhs, has_not_parent_block); } - fn connect_genesis_block( + fn connect_checkpoint_block( builder: &mut CircuitBuilder, x: &PublicValuesTarget, has_not_parent_block: Target, @@ -840,7 +922,7 @@ where .trie_roots_before .state_root .iter() - .zip(x.extra_block_data.genesis_state_trie_root) + .zip(x.extra_block_data.checkpoint_state_trie_root) { let mut constr = builder.sub(limb0, limb1); constr = builder.mul(has_not_parent_block, constr); @@ -855,22 +937,9 @@ where F: RichField + Extendable, { builder.connect( - x.block_metadata.block_gas_used[0], - x.extra_block_data.gas_used_after[0], - ); - builder.connect( - x.block_metadata.block_gas_used[1], - x.extra_block_data.gas_used_after[1], + x.block_metadata.block_gas_used, + x.extra_block_data.gas_used_after, ); - - for (&limb0, &limb1) in x - .block_metadata - .block_bloom - .iter() - .zip(&x.extra_block_data.block_bloom_after) - { - builder.connect(limb0, limb1); - } } fn connect_initial_values_block(builder: &mut CircuitBuilder, x: &PublicValuesTarget) @@ -880,13 +949,7 @@ where // The initial number of transactions is 0. 
builder.assert_zero(x.extra_block_data.txn_number_before); // The initial gas used is 0. - builder.assert_zero(x.extra_block_data.gas_used_before[0]); - builder.assert_zero(x.extra_block_data.gas_used_before[1]); - - // The initial bloom filter is all zeroes. - for t in x.extra_block_data.block_bloom_before { - builder.assert_zero(t); - } + builder.assert_zero(x.extra_block_data.gas_used_before); // The transactions and receipts tries are empty at the beginning of the block. let initial_trie = HashedPartialTrie::from(Node::Empty).hash(); @@ -898,15 +961,44 @@ where } } - /// Create a proof for each STARK, then combine them, eventually culminating in a root proof. + /// For a given transaction payload passed as [`GenerationInputs`], create a proof + /// for each STARK module, then recursively shrink and combine them, eventually + /// culminating in a transaction proof, also called root proof. + /// + /// # Arguments + /// + /// - `all_stark`: a structure defining the logic of all STARK modules and their associated + /// cross-table lookups. + /// - `config`: the configuration to be used for the STARK prover. It will usually be a fast + /// one yielding large proofs. + /// - `generation_inputs`: a transaction and auxiliary data needed to generate a proof, provided + /// in Intermediary Representation. + /// - `timing`: a profiler defining a scope hierarchy and the time consumed by each one. + /// - `abort_signal`: an optional [`AtomicBool`] wrapped behind an [`Arc`], to send a kill signal + /// early. This is only necessary in a distributed setting where a worker may be blocking the entire + /// queue. + /// + /// # Outputs + /// + /// This method outputs a tuple of [`ProofWithPublicInputs`] and its [`PublicValues`]. Only + /// the proof with public inputs is necessary for a verifier to assert correctness of the computation, + /// but the public values are output for the prover convenience, as these are necessary during proof + /// aggregation. 
pub fn prove_root( &self, all_stark: &AllStark, config: &StarkConfig, generation_inputs: GenerationInputs, timing: &mut TimingTree, + abort_signal: Option>, ) -> anyhow::Result<(ProofWithPublicInputs, PublicValues)> { - let all_proof = prove::(all_stark, config, generation_inputs, timing)?; + let all_proof = prove::( + all_stark, + config, + generation_inputs, + timing, + abort_signal.clone(), + )?; let mut root_inputs = PartialWitness::new(); for table in 0..NUM_TABLES { @@ -917,7 +1009,7 @@ where .by_stark_size .get(&original_degree_bits) .ok_or_else(|| { - anyhow::Error::msg(format!( + anyhow!(format!( "Missing preprocessed circuits for {:?} table with size {}.", Table::all()[table], original_degree_bits, @@ -934,6 +1026,97 @@ where F::from_canonical_usize(index_verifier_data), ); root_inputs.set_proof_with_pis_target(&self.root.proof_with_pis[table], &shrunk_proof); + + check_abort_signal(abort_signal.clone())?; + } + + root_inputs.set_verifier_data_target( + &self.root.cyclic_vk, + &self.aggregation.circuit.verifier_only, + ); + + set_public_value_targets( + &mut root_inputs, + &self.root.public_values, + &all_proof.public_values, + ) + .map_err(|_| { + anyhow::Error::msg("Invalid conversion when setting public values targets.") + })?; + + let root_proof = self.root.circuit.prove(root_inputs)?; + + Ok((root_proof, all_proof.public_values)) + } + + /// From an initial set of STARK proofs passed with their associated recursive table circuits, + /// generate a recursive transaction proof. + /// It is aimed at being used when preprocessed table circuits have not been loaded to memory. + /// + /// **Note**: + /// The type of the `table_circuits` passed as arguments is + /// `&[(RecursiveCircuitsForTableSize, u8); NUM_TABLES]`. In particular, for each STARK + /// proof contained within the `AllProof` object provided to this method, we need to pass a tuple + /// of [`RecursiveCircuitsForTableSize`] and a [`u8`]. 
The former is the recursive chain + /// corresponding to the initial degree size of the associated STARK proof. The latter is the + /// index of this degree in the range that was originally passed when constructing the entire prover + /// state. + /// + /// # Usage + /// + /// ```ignore + /// // Load a prover state without its recursive table circuits. + /// let gate_serializer = DefaultGateSerializer; + /// let generator_serializer = DefaultGeneratorSerializer::::new(); + /// let initial_ranges = [16..25, 10..20, 12..25, 14..25, 9..20, 12..20, 17..30]; + /// let prover_state = AllRecursiveCircuits::::new( + /// &all_stark, + /// &initial_ranges, + /// &config, + /// ); + /// + /// // Generate a proof from the provided inputs. + /// let stark_proof = prove::(&all_stark, &config, inputs, &mut timing, abort_signal).unwrap(); + /// + /// // Read the degrees of the internal STARK proofs. + /// // Indices to be passed along the recursive tables + /// // can be easily recovered as `initial_ranges[i]` - `degrees[i]`. + /// let degrees = proof.degree_bits(&config); + /// + /// // Retrieve the corresponding recursive table circuits for each table with the corresponding degree. + /// let table_circuits = { ... }; + /// + /// // Finally shrink the STARK proof. 
+ /// let (proof, public_values) = prove_root_after_initial_stark( + /// &all_stark, + /// &config, + /// &stark_proof, + /// &table_circuits, + /// &mut timing, + /// abort_signal, + /// ).unwrap(); + /// ``` + pub fn prove_root_after_initial_stark( + &self, + all_proof: AllProof, + table_circuits: &[(RecursiveCircuitsForTableSize, u8); NUM_TABLES], + abort_signal: Option>, + ) -> anyhow::Result<(ProofWithPublicInputs, PublicValues)> { + let mut root_inputs = PartialWitness::new(); + + for table in 0..NUM_TABLES { + let (table_circuit, index_verifier_data) = &table_circuits[table]; + + let stark_proof = &all_proof.stark_proofs[table]; + + let shrunk_proof = table_circuit.shrink(stark_proof, &all_proof.ctl_challenges)?; + root_inputs.set_target( + self.root.index_verifier_data[table], + F::from_canonical_u8(*index_verifier_data), + ); + root_inputs.set_proof_with_pis_target(&self.root.proof_with_pis[table], &shrunk_proof); + + check_abort_signal(abort_signal.clone())?; } root_inputs.set_verifier_data_target( @@ -959,13 +1142,39 @@ where self.root.circuit.verify(agg_proof) } + /// Create an aggregation proof, combining two contiguous proofs into a single one. The combined + /// proofs can either be transaction (aka root) proofs, or other aggregation proofs, as long as + /// their states are contiguous, meaning that the final state of the left child proof is the initial + /// state of the right child proof. + /// + /// While regular transaction proofs can only assert validity of a single transaction, aggregation + /// proofs can cover an arbitrary range, up to an entire block with all its transactions. + /// + /// # Arguments + /// + /// - `lhs_is_agg`: a boolean indicating whether the left child proof is an aggregation proof or + /// a regular transaction proof. + /// - `lhs_proof`: the left child proof. + /// - `lhs_public_values`: the public values associated to the right child proof. 
+ /// - `rhs_is_agg`: a boolean indicating whether the right child proof is an aggregation proof or + /// a regular transaction proof. + /// - `rhs_proof`: the right child proof. + /// - `rhs_public_values`: the public values associated to the right child proof. + /// + /// # Outputs + /// + /// This method outputs a tuple of [`ProofWithPublicInputs`] and its [`PublicValues`]. Only + /// the proof with public inputs is necessary for a verifier to assert correctness of the computation, + /// but the public values are output for the prover convenience, as these are necessary during proof + /// aggregation. pub fn prove_aggregation( &self, lhs_is_agg: bool, lhs_proof: &ProofWithPublicInputs, + lhs_public_values: PublicValues, rhs_is_agg: bool, rhs_proof: &ProofWithPublicInputs, - public_values: PublicValues, + rhs_public_values: PublicValues, ) -> anyhow::Result<(ProofWithPublicInputs, PublicValues)> { let mut agg_inputs = PartialWitness::new(); @@ -982,17 +1191,34 @@ where &self.aggregation.circuit.verifier_only, ); + // Aggregates both `PublicValues` from the provided proofs into a single one. 
+ let agg_public_values = PublicValues { + trie_roots_before: lhs_public_values.trie_roots_before, + trie_roots_after: rhs_public_values.trie_roots_after, + extra_block_data: ExtraBlockData { + checkpoint_state_trie_root: lhs_public_values + .extra_block_data + .checkpoint_state_trie_root, + txn_number_before: lhs_public_values.extra_block_data.txn_number_before, + txn_number_after: rhs_public_values.extra_block_data.txn_number_after, + gas_used_before: lhs_public_values.extra_block_data.gas_used_before, + gas_used_after: rhs_public_values.extra_block_data.gas_used_after, + }, + block_metadata: rhs_public_values.block_metadata, + block_hashes: rhs_public_values.block_hashes, + }; + set_public_value_targets( &mut agg_inputs, &self.aggregation.public_values, - &public_values, + &agg_public_values, ) .map_err(|_| { anyhow::Error::msg("Invalid conversion when setting public values targets.") })?; let aggregation_proof = self.aggregation.circuit.prove(agg_inputs)?; - Ok((aggregation_proof, public_values)) + Ok((aggregation_proof, agg_public_values)) } pub fn verify_aggregation( @@ -1007,6 +1233,23 @@ where ) } + /// Create a final block proof, once all transactions of a given block have been combined into a + /// single aggregation proof. + /// + /// Block proofs can either be generated as standalone, or combined with a previous block proof + /// to assert validity of a range of blocks. + /// + /// # Arguments + /// + /// - `opt_parent_block_proof`: an optional parent block proof. Passing one will generate a proof of + /// validity for both the block range covered by the previous proof and the current block. + /// - `agg_root_proof`: the final aggregation proof containing all transactions within the current block. + /// - `public_values`: the public values associated to the aggregation proof. + /// + /// # Outputs + /// + /// This method outputs a tuple of [`ProofWithPublicInputs`] and its [`PublicValues`]. 
Only + /// the proof with public inputs is necessary for a verifier to assert correctness of the computation. pub fn prove_block( &self, opt_parent_block_proof: Option<&ProofWithPublicInputs>, @@ -1023,33 +1266,90 @@ where block_inputs .set_proof_with_pis_target(&self.block.parent_block_proof, parent_block_proof); } else { - // Initialize genesis_state_trie, state_root_after and the block number for correct connection between blocks. - // Initialize `state_root_after`. - let state_trie_root_after_keys = 24..32; + if public_values.trie_roots_before.state_root + != public_values.extra_block_data.checkpoint_state_trie_root + { + return Err(anyhow::Error::msg(format!( + "Inconsistent pre-state for first block {:?} with checkpoint state {:?}.", + public_values.trie_roots_before.state_root, + public_values.extra_block_data.checkpoint_state_trie_root, + ))); + } + + // Initialize some public inputs for correct connection between the checkpoint block and the current one. let mut nonzero_pis = HashMap::new(); + + // Initialize the checkpoint block roots before, and state root after. 
+ let state_trie_root_before_keys = 0..TrieRootsTarget::HASH_SIZE; + for (key, &value) in state_trie_root_before_keys + .zip_eq(&h256_limbs::(public_values.trie_roots_before.state_root)) + { + nonzero_pis.insert(key, value); + } + let txn_trie_root_before_keys = + TrieRootsTarget::HASH_SIZE..TrieRootsTarget::HASH_SIZE * 2; + for (key, &value) in txn_trie_root_before_keys.clone().zip_eq(&h256_limbs::( + public_values.trie_roots_before.transactions_root, + )) { + nonzero_pis.insert(key, value); + } + let receipts_trie_root_before_keys = + TrieRootsTarget::HASH_SIZE * 2..TrieRootsTarget::HASH_SIZE * 3; + for (key, &value) in receipts_trie_root_before_keys + .clone() + .zip_eq(&h256_limbs::( + public_values.trie_roots_before.receipts_root, + )) + { + nonzero_pis.insert(key, value); + } + let state_trie_root_after_keys = + TrieRootsTarget::SIZE..TrieRootsTarget::SIZE + TrieRootsTarget::HASH_SIZE; for (key, &value) in state_trie_root_after_keys .zip_eq(&h256_limbs::(public_values.trie_roots_before.state_root)) { nonzero_pis.insert(key, value); } - // Initialize the genesis state trie digest. - let genesis_state_trie_keys = TrieRootsTarget::SIZE * 2 - + BlockMetadataTarget::SIZE - + BlockHashesTarget::BLOCK_HASHES_SIZE - ..TrieRootsTarget::SIZE * 2 - + BlockMetadataTarget::SIZE - + BlockHashesTarget::BLOCK_HASHES_SIZE - + 8; - for (key, &value) in genesis_state_trie_keys.zip_eq(&h256_limbs::( - public_values.extra_block_data.genesis_state_trie_root, + // Initialize the checkpoint state root extra data. + let checkpoint_state_trie_keys = + TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE + BlockHashesTarget::SIZE + ..TrieRootsTarget::SIZE * 2 + + BlockMetadataTarget::SIZE + + BlockHashesTarget::SIZE + + 8; + for (key, &value) in checkpoint_state_trie_keys.zip_eq(&h256_limbs::( + public_values.extra_block_data.checkpoint_state_trie_root, )) { nonzero_pis.insert(key, value); } - // Initialize the block number. + // Initialize checkpoint block hashes. 
+ // These will be all zeros the initial genesis checkpoint. + let block_hashes_keys = TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE + ..TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE + BlockHashesTarget::SIZE + - 8; + + for i in 0..public_values.block_hashes.prev_hashes.len() - 1 { + let targets = h256_limbs::(public_values.block_hashes.prev_hashes[i]); + for j in 0..8 { + nonzero_pis.insert(block_hashes_keys.start + 8 * (i + 1) + j, targets[j]); + } + } + let block_hashes_current_start = + TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE + BlockHashesTarget::SIZE - 8; + let cur_targets = h256_limbs::(public_values.block_hashes.prev_hashes[255]); + for i in 0..8 { + nonzero_pis.insert(block_hashes_current_start + i, cur_targets[i]); + } + + // Initialize the checkpoint block number. + // Subtraction would result in an invalid proof for genesis, but we shouldn't try proving this block anyway. let block_number_key = TrieRootsTarget::SIZE * 2 + 6; - nonzero_pis.insert(block_number_key, F::NEG_ONE); + nonzero_pis.insert( + block_number_key, + F::from_canonical_u64(public_values.block_metadata.block_number.low_u64() - 1), + ); block_inputs.set_proof_with_pis_target( &self.block.parent_block_proof, @@ -1066,13 +1366,26 @@ where block_inputs .set_verifier_data_target(&self.block.cyclic_vk, &self.block.circuit.verifier_only); - set_public_value_targets(&mut block_inputs, &self.block.public_values, &public_values) - .map_err(|_| { - anyhow::Error::msg("Invalid conversion when setting public values targets.") - })?; + // This is basically identical to this block public values, apart from the `trie_roots_before` + // that may come from the previous proof, if any. 
+ let block_public_values = PublicValues { + trie_roots_before: opt_parent_block_proof + .map(|p| TrieRoots::from_public_inputs(&p.public_inputs[0..TrieRootsTarget::SIZE])) + .unwrap_or(public_values.trie_roots_before), + ..public_values + }; + + set_public_value_targets( + &mut block_inputs, + &self.block.public_values, + &block_public_values, + ) + .map_err(|_| { + anyhow::Error::msg("Invalid conversion when setting public values targets.") + })?; let block_proof = self.block.circuit.prove(block_inputs)?; - Ok((block_proof, public_values)) + Ok((block_proof, block_public_values)) } pub fn verify_block(&self, block_proof: &ProofWithPublicInputs) -> anyhow::Result<()> { @@ -1085,6 +1398,7 @@ where } } +/// A map between initial degree sizes and their associated shrinking recursion circuits. #[derive(Eq, PartialEq, Debug)] pub struct RecursiveCircuitsForTable where @@ -1094,7 +1408,7 @@ where { /// A map from `log_2(height)` to a chain of shrinking recursion circuits starting at that /// height. - by_stark_size: BTreeMap>, + pub by_stark_size: BTreeMap>, } impl RecursiveCircuitsForTable @@ -1103,7 +1417,7 @@ where C: GenericConfig, C::Hasher: AlgebraicHasher, { - pub fn to_buffer( + fn to_buffer( &self, buffer: &mut Vec, gate_serializer: &dyn GateSerializer, @@ -1117,7 +1431,7 @@ where Ok(()) } - pub fn from_buffer( + fn from_buffer( buffer: &mut Buffer, gate_serializer: &dyn GateSerializer, generator_serializer: &dyn WitnessGeneratorSerializer, @@ -1179,7 +1493,7 @@ where /// A chain of shrinking wrapper circuits, ending with a final circuit with `degree_bits` /// `THRESHOLD_DEGREE_BITS`. 
#[derive(Eq, PartialEq, Debug)] -struct RecursiveCircuitsForTableSize +pub struct RecursiveCircuitsForTableSize where F: RichField + Extendable, C: GenericConfig, @@ -1313,7 +1627,7 @@ where } } - fn shrink( + pub fn shrink( &self, stark_proof_with_metadata: &StarkProofWithMetadata, ctl_challenges: &GrandProductChallengeSet, diff --git a/evm/src/generation/mod.rs b/evm/src/generation/mod.rs index 49c0aebaff..3c7a9c6050 100644 --- a/evm/src/generation/mod.rs +++ b/evm/src/generation/mod.rs @@ -1,10 +1,11 @@ -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use anyhow::anyhow; use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; use ethereum_types::{Address, BigEndianHash, H256, U256}; use plonky2::field::extension::Extendable; use plonky2::field::polynomial::PolynomialValues; +use plonky2::field::types::Field; use plonky2::hash::hash_types::RichField; use plonky2::timed; use plonky2::util::timing::TimingTree; @@ -16,57 +17,58 @@ use GlobalMetadata::{ use crate::all_stark::{AllStark, NUM_TABLES}; use crate::config::StarkConfig; -use crate::cpu::bootstrap_kernel::generate_bootstrap_kernel; use crate::cpu::columns::CpuColumnsView; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; -use crate::generation::outputs::{get_outputs, GenerationOutputs}; use crate::generation::state::GenerationState; +use crate::generation::trie_extractor::{get_receipt_trie, get_state_trie, get_txn_trie}; use crate::memory::segments::Segment; use crate::proof::{BlockHashes, BlockMetadata, ExtraBlockData, PublicValues, TrieRoots}; -use crate::util::h2u; +use crate::util::{h2u, u256_to_u8, u256_to_usize}; use crate::witness::memory::{MemoryAddress, MemoryChannel}; use crate::witness::transition::transition; pub mod mpt; -pub mod outputs; pub(crate) mod prover_input; pub(crate) mod rlp; pub(crate) mod state; mod trie_extractor; -use crate::witness::util::mem_write_log; +use 
crate::witness::util::{mem_write_log, stack_peek}; /// Inputs needed for trace generation. #[derive(Clone, Debug, Deserialize, Serialize, Default)] pub struct GenerationInputs { + /// The index of the transaction being proven within its block. pub txn_number_before: U256, + /// The cumulative gas used through the execution of all transactions prior the current one. pub gas_used_before: U256, - pub block_bloom_before: [U256; 8], + /// The cumulative gas used after the execution of the current transaction. The exact gas used + /// by the current transaction is `gas_used_after` - `gas_used_before`. pub gas_used_after: U256, - pub block_bloom_after: [U256; 8], - pub signed_txns: Vec>, + /// A None would yield an empty proof, otherwise this contains the encoding of a transaction. + pub signed_txn: Option>, + /// Withdrawal pairs `(addr, amount)`. At the end of the txs, `amount` is added to `addr`'s balance. See EIP-4895. + pub withdrawals: Vec<(Address, U256)>, pub tries: TrieInputs, /// Expected trie roots after the transactions are executed. pub trie_roots_after: TrieRoots, - /// State trie root of the genesis block. - pub genesis_state_trie_root: H256, + + /// State trie root of the checkpoint block. + /// This could always be the genesis block of the chain, but it allows a prover to continue proving blocks + /// from certain checkpoint heights without requiring proofs for blocks past this checkpoint. + pub checkpoint_state_trie_root: H256, /// Mapping between smart contract code hashes and the contract byte code. /// All account smart contracts that are invoked will have an entry present. pub contract_code: HashMap>, + /// Information contained in the block header. pub block_metadata: BlockMetadata, + /// The hash of the current block, and a list of the 256 previous block hashes. pub block_hashes: BlockHashes, - - /// A list of known addresses in the input state trie (which itself doesn't hold addresses, - /// only state keys). 
This is only useful for debugging, so that we can return addresses in the - /// post-state rather than state keys. (See `GenerationOutputs`, and in particular - /// `AddressOrStateKey`.) If the caller is not interested in the post-state, this can be left - /// empty. - pub addresses: Vec

, } #[derive(Clone, Debug, Deserialize, Serialize, Default)] @@ -124,7 +126,7 @@ fn apply_metadata_and_tries_memops, const D: usize> (GlobalMetadata::TxnNumberBefore, inputs.txn_number_before), ( GlobalMetadata::TxnNumberAfter, - inputs.txn_number_before + inputs.signed_txns.len(), + inputs.txn_number_before + if inputs.signed_txn.is_some() { 1 } else { 0 }, ), ( GlobalMetadata::StateTrieRootDigestBefore, @@ -150,6 +152,8 @@ fn apply_metadata_and_tries_memops, const D: usize> GlobalMetadata::ReceiptTrieRootDigestAfter, h2u(trie_roots_after.receipts_root), ), + (GlobalMetadata::KernelHash, h2u(KERNEL.code_hash)), + (GlobalMetadata::KernelLen, KERNEL.code.len().into()), ]; let channel = MemoryChannel::GeneralPurpose(0); @@ -157,7 +161,8 @@ fn apply_metadata_and_tries_memops, const D: usize> .map(|(field, val)| { mem_write_log( channel, - MemoryAddress::new(0, Segment::GlobalMetadata, field as usize), + // These fields are already scaled by their segment, and are in context 0 (kernel). + MemoryAddress::new_bundle(U256::from(field as usize)).unwrap(), state, val, ) @@ -173,32 +178,7 @@ fn apply_metadata_and_tries_memops, const D: usize> metadata.block_bloom[i], ) })); - // Write the block's bloom filter before the current transaction. - ops.extend( - (0..8) - .map(|i| { - mem_write_log( - channel, - MemoryAddress::new(0, Segment::GlobalBlockBloom, i + 8), - state, - inputs.block_bloom_before[i], - ) - }) - .collect::>(), - ); - // Write the block's bloom filter after the current transaction. - ops.extend( - (0..8) - .map(|i| { - mem_write_log( - channel, - MemoryAddress::new(0, Segment::GlobalBlockBloom, i + 16), - state, - inputs.block_bloom_after[i], - ) - }) - .collect::>(), - ); + // Write previous block hashes. 
ops.extend( (0..256) @@ -222,33 +202,67 @@ pub fn generate_traces, const D: usize>( inputs: GenerationInputs, config: &StarkConfig, timing: &mut TimingTree, -) -> anyhow::Result<( - [Vec>; NUM_TABLES], - PublicValues, - GenerationOutputs, -)> { +) -> anyhow::Result<([Vec>; NUM_TABLES], PublicValues)> { let mut state = GenerationState::::new(inputs.clone(), &KERNEL.code) .map_err(|err| anyhow!("Failed to parse all the initial prover inputs: {:?}", err))?; apply_metadata_and_tries_memops(&mut state, &inputs); - generate_bootstrap_kernel::(&mut state); - - timed!(timing, "simulate CPU", simulate_cpu(&mut state)?); + let cpu_res = timed!(timing, "simulate CPU", simulate_cpu(&mut state)); + if cpu_res.is_err() { + // Retrieve previous PC (before jumping to KernelPanic), to see if we reached `hash_final_tries`. + // We will output debugging information on the final tries only if we got a root mismatch. + let previous_pc = state + .traces + .cpu + .last() + .expect("We should have CPU rows") + .program_counter + .to_canonical_u64() as usize; + + if KERNEL.offset_name(previous_pc).contains("hash_final_tries") { + let state_trie_ptr = u256_to_usize( + state + .memory + .read_global_metadata(GlobalMetadata::StateTrieRoot), + ) + .map_err(|_| anyhow!("State trie pointer is too large to fit in a usize."))?; + log::debug!( + "Computed state trie: {:?}", + get_state_trie::(&state.memory, state_trie_ptr) + ); + + let txn_trie_ptr = u256_to_usize( + state + .memory + .read_global_metadata(GlobalMetadata::TransactionTrieRoot), + ) + .map_err(|_| anyhow!("Transactions trie pointer is too large to fit in a usize."))?; + log::debug!( + "Computed transactions trie: {:?}", + get_txn_trie::(&state.memory, txn_trie_ptr) + ); + + let receipt_trie_ptr = u256_to_usize( + state + .memory + .read_global_metadata(GlobalMetadata::ReceiptTrieRoot), + ) + .map_err(|_| anyhow!("Receipts trie pointer is too large to fit in a usize."))?; + log::debug!( + "Computed receipts trie: {:?}", + 
get_receipt_trie::(&state.memory, receipt_trie_ptr) + ); + } - assert!( - state.mpt_prover_inputs.is_empty(), - "All MPT data should have been consumed" - ); + cpu_res?; + } log::info!( "Trace lengths (before padding): {:?}", state.traces.get_lengths() ); - let outputs = get_outputs(&mut state) - .map_err(|err| anyhow!("Failed to generate post-state info: {:?}", err))?; - let read_metadata = |field| state.memory.read_global_metadata(field); let trie_roots_before = TrieRoots { state_root: H256::from_uint(&read_metadata(StateTrieRootDigestBefore)), @@ -265,13 +279,11 @@ pub fn generate_traces, const D: usize>( let txn_number_after = read_metadata(GlobalMetadata::TxnNumberAfter); let extra_block_data = ExtraBlockData { - genesis_state_trie_root: inputs.genesis_state_trie_root, + checkpoint_state_trie_root: inputs.checkpoint_state_trie_root, txn_number_before: inputs.txn_number_before, txn_number_after, gas_used_before: inputs.gas_used_before, gas_used_after, - block_bloom_before: inputs.block_bloom_before, - block_bloom_after: inputs.block_bloom_after, }; let public_values = PublicValues { @@ -287,12 +299,10 @@ pub fn generate_traces, const D: usize>( "convert trace data to tables", state.traces.into_tables(all_stark, config, timing) ); - Ok((tables, public_values, outputs)) + Ok((tables, public_values)) } -fn simulate_cpu, const D: usize>( - state: &mut GenerationState, -) -> anyhow::Result<()> { +fn simulate_cpu(state: &mut GenerationState) -> anyhow::Result<()> { let halt_pc = KERNEL.global_labels["halt"]; loop { @@ -308,10 +318,7 @@ fn simulate_cpu, const D: usize>( row.context = F::from_canonical_usize(state.registers.context); row.program_counter = F::from_canonical_usize(pc); row.is_kernel_mode = F::ONE; - row.gas = [ - F::from_canonical_u32(state.registers.gas_used as u32), - F::from_canonical_u32((state.registers.gas_used >> 32) as u32), - ]; + row.gas = F::from_canonical_u64(state.registers.gas_used); row.stack_len = 
F::from_canonical_usize(state.registers.stack_len); loop { @@ -321,6 +328,7 @@ fn simulate_cpu, const D: usize>( break; } } + log::info!("CPU trace padded to {} cycles", state.traces.clock()); return Ok(()); @@ -329,3 +337,87 @@ fn simulate_cpu, const D: usize>( transition(state)?; } } + +fn simulate_cpu_between_labels_and_get_user_jumps( + initial_label: &str, + final_label: &str, + state: &mut GenerationState, +) -> Option>> { + if state.jumpdest_table.is_some() { + None + } else { + const JUMP_OPCODE: u8 = 0x56; + const JUMPI_OPCODE: u8 = 0x57; + + let halt_pc = KERNEL.global_labels[final_label]; + let mut jumpdest_addresses: HashMap<_, BTreeSet> = HashMap::new(); + + state.registers.program_counter = KERNEL.global_labels[initial_label]; + let initial_clock = state.traces.clock(); + let initial_context = state.registers.context; + + log::debug!("Simulating CPU for jumpdest analysis."); + + loop { + // skip jumpdest table validations in simulations + if state.registers.is_kernel + && state.registers.program_counter == KERNEL.global_labels["jumpdest_analysis"] + { + state.registers.program_counter = KERNEL.global_labels["jumpdest_analysis_end"] + } + let pc = state.registers.program_counter; + let context = state.registers.context; + let halt = state.registers.is_kernel + && pc == halt_pc + && state.registers.context == initial_context; + let Ok(opcode) = u256_to_u8(state.memory.get(MemoryAddress::new( + context, + Segment::Code, + state.registers.program_counter, + ))) else { + log::debug!( + "Simulated CPU for jumpdest analysis halted after {} cycles", + state.traces.clock() - initial_clock + ); + return Some(jumpdest_addresses); + }; + let cond = if let Ok(cond) = stack_peek(state, 1) { + cond != U256::zero() + } else { + false + }; + if !state.registers.is_kernel + && (opcode == JUMP_OPCODE || (opcode == JUMPI_OPCODE && cond)) + { + // Avoid deeper calls to abort + let Ok(jumpdest) = u256_to_usize(state.registers.stack_top) else { + log::debug!( + "Simulated 
CPU for jumpdest analysis halted after {} cycles", + state.traces.clock() - initial_clock + ); + return Some(jumpdest_addresses); + }; + state.memory.set( + MemoryAddress::new(context, Segment::JumpdestBits, jumpdest), + U256::one(), + ); + let jumpdest_opcode = + state + .memory + .get(MemoryAddress::new(context, Segment::Code, jumpdest)); + if let Some(ctx_addresses) = jumpdest_addresses.get_mut(&context) { + ctx_addresses.insert(jumpdest); + } else { + jumpdest_addresses.insert(context, BTreeSet::from([jumpdest])); + } + } + if halt || transition(state).is_err() { + log::debug!( + "Simulated CPU for jumpdest analysis halted after {} cycles", + state.traces.clock() - initial_clock + ); + return Some(jumpdest_addresses); + } + } + } +} diff --git a/evm/src/generation/mpt.rs b/evm/src/generation/mpt.rs index 20e8b30b60..ee530ddef5 100644 --- a/evm/src/generation/mpt.rs +++ b/evm/src/generation/mpt.rs @@ -1,5 +1,5 @@ +use core::ops::Deref; use std::collections::HashMap; -use std::ops::Deref; use bytes::Bytes; use eth_trie_utils::nibbles::Nibbles; @@ -11,6 +11,7 @@ use rlp_derive::{RlpDecodable, RlpEncodable}; use crate::cpu::kernel::constants::trie_type::PartialTrieType; use crate::generation::TrieInputs; +use crate::util::h2u; use crate::witness::errors::{ProgramError, ProverInputError}; use crate::Node; @@ -22,6 +23,13 @@ pub struct AccountRlp { pub code_hash: H256, } +#[derive(Clone, Debug)] +pub struct TrieRootPtrs { + pub state_root_ptr: usize, + pub txn_root_ptr: usize, + pub receipt_root_ptr: usize, +} + impl Default for AccountRlp { fn default() -> Self { Self { @@ -48,19 +56,36 @@ pub struct LegacyReceiptRlp { pub logs: Vec, } -pub(crate) fn all_mpt_prover_inputs_reversed( - trie_inputs: &TrieInputs, -) -> Result, ProgramError> { - let mut inputs = all_mpt_prover_inputs(trie_inputs)?; - inputs.reverse(); - Ok(inputs) +impl LegacyReceiptRlp { + // RLP encode the receipt and prepend the tx type. 
+ pub fn encode(&self, tx_type: u8) -> Vec { + let mut bytes = rlp::encode(self).to_vec(); + if tx_type != 0 { + bytes.insert(0, tx_type); + } + bytes + } } pub(crate) fn parse_receipts(rlp: &[u8]) -> Result, ProgramError> { + let txn_type = match rlp.first().ok_or(ProgramError::InvalidRlp)? { + 1 => 1, + 2 => 2, + _ => 0, + }; + + // If this is not a legacy transaction, we skip the leading byte. + let rlp = if txn_type == 0 { rlp } else { &rlp[1..] }; + let payload_info = PayloadInfo::from(rlp).map_err(|_| ProgramError::InvalidRlp)?; let decoded_receipt: LegacyReceiptRlp = rlp::decode(rlp).map_err(|_| ProgramError::InvalidRlp)?; - let mut parsed_receipt = Vec::new(); + + let mut parsed_receipt = if txn_type == 0 { + Vec::new() + } else { + vec![txn_type.into()] + }; parsed_receipt.push(payload_info.value_len.into()); // payload_len of the entire receipt parsed_receipt.push((decoded_receipt.status as u8).into()); @@ -86,113 +111,114 @@ pub(crate) fn parse_receipts(rlp: &[u8]) -> Result, ProgramError> { Ok(parsed_receipt) } -/// Generate prover inputs for the initial MPT data, in the format expected by `mpt/load.asm`. 
-pub(crate) fn all_mpt_prover_inputs(trie_inputs: &TrieInputs) -> Result, ProgramError> { - let mut prover_inputs = vec![]; - - let storage_tries_by_state_key = trie_inputs - .storage_tries - .iter() - .map(|(hashed_address, storage_trie)| { - let key = Nibbles::from_bytes_be(hashed_address.as_bytes()).unwrap(); - (key, storage_trie) - }) - .collect(); - - mpt_prover_inputs_state_trie( - &trie_inputs.state_trie, - empty_nibbles(), - &mut prover_inputs, - &storage_tries_by_state_key, - )?; - - mpt_prover_inputs(&trie_inputs.transactions_trie, &mut prover_inputs, &|rlp| { - let mut parsed_txn = vec![U256::from(rlp.len())]; - parsed_txn.extend(rlp.iter().copied().map(U256::from)); - Ok(parsed_txn) - })?; - mpt_prover_inputs( - &trie_inputs.receipts_trie, - &mut prover_inputs, - &parse_receipts, - )?; +fn parse_storage_value(value_rlp: &[u8]) -> Result, ProgramError> { + let value: U256 = rlp::decode(value_rlp).map_err(|_| ProgramError::InvalidRlp)?; + Ok(vec![value]) +} - Ok(prover_inputs) +const fn empty_nibbles() -> Nibbles { + Nibbles { + count: 0, + packed: U512::zero(), + } } -/// Given a trie, generate the prover input data for that trie. In essence, this serializes a trie -/// into a `U256` array, in a simple format which the kernel understands. For example, a leaf node -/// is serialized as `(TYPE_LEAF, key, value)`, where key is a `(nibbles, depth)` pair and `value` -/// is a variable-length structure which depends on which trie we're dealing with. 
-pub(crate) fn mpt_prover_inputs( +fn load_mpt( trie: &HashedPartialTrie, - prover_inputs: &mut Vec, + trie_data: &mut Vec, parse_value: &F, -) -> Result<(), ProgramError> +) -> Result where F: Fn(&[u8]) -> Result, ProgramError>, { - prover_inputs.push((PartialTrieType::of(trie) as u32).into()); + let node_ptr = trie_data.len(); + let type_of_trie = PartialTrieType::of(trie) as u32; + if type_of_trie > 0 { + trie_data.push(type_of_trie.into()); + } match trie.deref() { - Node::Empty => Ok(()), + Node::Empty => Ok(0), Node::Hash(h) => { - prover_inputs.push(U256::from_big_endian(h.as_bytes())); - Ok(()) + trie_data.push(h2u(*h)); + + Ok(node_ptr) } Node::Branch { children, value } => { + // First, set children pointers to 0. + let first_child_ptr = trie_data.len(); + trie_data.extend(vec![U256::zero(); 16]); + // Then, set value. if value.is_empty() { - prover_inputs.push(U256::zero()); // value_present = 0 + trie_data.push(U256::zero()); } else { let parsed_value = parse_value(value)?; - prover_inputs.push(U256::one()); // value_present = 1 - prover_inputs.extend(parsed_value); + trie_data.push((trie_data.len() + 1).into()); + trie_data.extend(parsed_value); } - for child in children { - mpt_prover_inputs(child, prover_inputs, parse_value)?; + + // Now, load all children and update their pointers. 
+ for (i, child) in children.iter().enumerate() { + let child_ptr = load_mpt(child, trie_data, parse_value)?; + trie_data[first_child_ptr + i] = child_ptr.into(); } - Ok(()) + Ok(node_ptr) } + Node::Extension { nibbles, child } => { - prover_inputs.push(nibbles.count.into()); - prover_inputs.push( + trie_data.push(nibbles.count.into()); + trie_data.push( nibbles .try_into_u256() .map_err(|_| ProgramError::IntegerTooLarge)?, ); - mpt_prover_inputs(child, prover_inputs, parse_value) + trie_data.push((trie_data.len() + 1).into()); + + let child_ptr = load_mpt(child, trie_data, parse_value)?; + if child_ptr == 0 { + trie_data.push(0.into()); + } + + Ok(node_ptr) } Node::Leaf { nibbles, value } => { - prover_inputs.push(nibbles.count.into()); - prover_inputs.push( + trie_data.push(nibbles.count.into()); + trie_data.push( nibbles .try_into_u256() .map_err(|_| ProgramError::IntegerTooLarge)?, ); + + // Set `value_ptr_ptr`. + trie_data.push((trie_data.len() + 1).into()); + let leaf = parse_value(value)?; - prover_inputs.extend(leaf); + trie_data.extend(leaf); - Ok(()) + Ok(node_ptr) } } } -/// Like `mpt_prover_inputs`, but for the state trie, which is a bit unique since each value -/// leads to a storage trie which we recursively traverse. 
-pub(crate) fn mpt_prover_inputs_state_trie( +fn load_state_trie( trie: &HashedPartialTrie, key: Nibbles, - prover_inputs: &mut Vec, + trie_data: &mut Vec, storage_tries_by_state_key: &HashMap, -) -> Result<(), ProgramError> { - prover_inputs.push((PartialTrieType::of(trie) as u32).into()); +) -> Result { + let node_ptr = trie_data.len(); + let type_of_trie = PartialTrieType::of(trie) as u32; + if type_of_trie > 0 { + trie_data.push(type_of_trie.into()); + } match trie.deref() { - Node::Empty => Ok(()), + Node::Empty => Ok(0), Node::Hash(h) => { - prover_inputs.push(U256::from_big_endian(h.as_bytes())); - Ok(()) + trie_data.push(h2u(*h)); + + Ok(node_ptr) } Node::Branch { children, value } => { if !value.is_empty() { @@ -200,37 +226,43 @@ pub(crate) fn mpt_prover_inputs_state_trie( ProverInputError::InvalidMptInput, )); } - prover_inputs.push(U256::zero()); // value_present = 0 + // First, set children pointers to 0. + let first_child_ptr = trie_data.len(); + trie_data.extend(vec![U256::zero(); 16]); + // Then, set value pointer to 0. + trie_data.push(U256::zero()); + // Now, load all children and update their pointers. for (i, child) in children.iter().enumerate() { let extended_key = key.merge_nibbles(&Nibbles { count: 1, packed: i.into(), }); - mpt_prover_inputs_state_trie( - child, - extended_key, - prover_inputs, - storage_tries_by_state_key, - )?; + let child_ptr = + load_state_trie(child, extended_key, trie_data, storage_tries_by_state_key)?; + + trie_data[first_child_ptr + i] = child_ptr.into(); } - Ok(()) + Ok(node_ptr) } Node::Extension { nibbles, child } => { - prover_inputs.push(nibbles.count.into()); - prover_inputs.push( + trie_data.push(nibbles.count.into()); + trie_data.push( nibbles .try_into_u256() .map_err(|_| ProgramError::IntegerTooLarge)?, ); + // Set `value_ptr_ptr`. 
+ trie_data.push((trie_data.len() + 1).into()); let extended_key = key.merge_nibbles(nibbles); - mpt_prover_inputs_state_trie( - child, - extended_key, - prover_inputs, - storage_tries_by_state_key, - ) + let child_ptr = + load_state_trie(child, extended_key, trie_data, storage_tries_by_state_key)?; + if child_ptr == 0 { + trie_data.push(0.into()); + } + + Ok(node_ptr) } Node::Leaf { nibbles, value } => { let account: AccountRlp = rlp::decode(value).map_err(|_| ProgramError::InvalidRlp)?; @@ -249,34 +281,69 @@ pub(crate) fn mpt_prover_inputs_state_trie( .unwrap_or(&storage_hash_only); assert_eq!(storage_trie.hash(), storage_root, - "In TrieInputs, an account's storage_root didn't match the associated storage trie hash"); + "In TrieInputs, an account's storage_root didn't match the associated storage trie hash"); - prover_inputs.push(nibbles.count.into()); - prover_inputs.push( + trie_data.push(nibbles.count.into()); + trie_data.push( nibbles .try_into_u256() .map_err(|_| ProgramError::IntegerTooLarge)?, ); - prover_inputs.push(nonce); - prover_inputs.push(balance); - mpt_prover_inputs(storage_trie, prover_inputs, &parse_storage_value)?; - prover_inputs.push(code_hash.into_uint()); + // Set `value_ptr_ptr`. + trie_data.push((trie_data.len() + 1).into()); + + trie_data.push(nonce); + trie_data.push(balance); + // Storage trie ptr. 
+ let storage_ptr_ptr = trie_data.len(); + trie_data.push((trie_data.len() + 2).into()); + trie_data.push(code_hash.into_uint()); + let storage_ptr = load_mpt(storage_trie, trie_data, &parse_storage_value)?; + if storage_ptr == 0 { + trie_data[storage_ptr_ptr] = 0.into(); + } - Ok(()) + Ok(node_ptr) } } } -fn parse_storage_value(value_rlp: &[u8]) -> Result, ProgramError> { - let value: U256 = rlp::decode(value_rlp).map_err(|_| ProgramError::InvalidRlp)?; - Ok(vec![value]) -} +pub(crate) fn load_all_mpts( + trie_inputs: &TrieInputs, +) -> Result<(TrieRootPtrs, Vec), ProgramError> { + let mut trie_data = vec![U256::zero()]; + let storage_tries_by_state_key = trie_inputs + .storage_tries + .iter() + .map(|(hashed_address, storage_trie)| { + let key = Nibbles::from_bytes_be(hashed_address.as_bytes()) + .expect("An H256 is 32 bytes long"); + (key, storage_trie) + }) + .collect(); -fn empty_nibbles() -> Nibbles { - Nibbles { - count: 0, - packed: U512::zero(), - } + let state_root_ptr = load_state_trie( + &trie_inputs.state_trie, + empty_nibbles(), + &mut trie_data, + &storage_tries_by_state_key, + )?; + + let txn_root_ptr = load_mpt(&trie_inputs.transactions_trie, &mut trie_data, &|rlp| { + let mut parsed_txn = vec![U256::from(rlp.len())]; + parsed_txn.extend(rlp.iter().copied().map(U256::from)); + Ok(parsed_txn) + })?; + + let receipt_root_ptr = load_mpt(&trie_inputs.receipts_trie, &mut trie_data, &parse_receipts)?; + + let trie_root_ptrs = TrieRootPtrs { + state_root_ptr, + txn_root_ptr, + receipt_root_ptr, + }; + + Ok((trie_root_ptrs, trie_data)) } pub mod transaction_testing { diff --git a/evm/src/generation/outputs.rs b/evm/src/generation/outputs.rs deleted file mode 100644 index 0ce8708297..0000000000 --- a/evm/src/generation/outputs.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::collections::HashMap; - -use ethereum_types::{Address, BigEndianHash, H256, U256}; -use plonky2::field::types::Field; - -use 
crate::cpu::kernel::constants::global_metadata::GlobalMetadata::StateTrieRoot; -use crate::generation::state::GenerationState; -use crate::generation::trie_extractor::{ - read_state_trie_value, read_storage_trie_value, read_trie, AccountTrieRecord, -}; -use crate::util::u256_to_usize; -use crate::witness::errors::ProgramError; - -/// The post-state after trace generation; intended for debugging. -#[derive(Clone, Debug)] -pub struct GenerationOutputs { - pub accounts: HashMap, -} - -#[derive(Clone, Eq, PartialEq, Hash, Debug)] -pub enum AddressOrStateKey { - Address(Address), - StateKey(H256), -} - -#[derive(Clone, Debug)] -pub struct AccountOutput { - pub balance: U256, - pub nonce: u64, - pub code: Vec, - pub storage: HashMap, -} - -pub(crate) fn get_outputs( - state: &mut GenerationState, -) -> Result { - // First observe all addresses passed in by caller. - for address in state.inputs.addresses.clone() { - state.observe_address(address); - } - - let ptr = u256_to_usize(state.memory.read_global_metadata(StateTrieRoot))?; - let account_map = read_trie::(&state.memory, ptr, read_state_trie_value)?; - - let mut accounts = HashMap::with_capacity(account_map.len()); - - for (state_key_nibbles, account) in account_map.into_iter() { - if state_key_nibbles.count != 64 { - return Err(ProgramError::IntegerTooLarge); - } - let state_key_h256 = H256::from_uint(&state_key_nibbles.try_into_u256().unwrap()); - - let addr_or_state_key = - if let Some(address) = state.state_key_to_address.get(&state_key_h256) { - AddressOrStateKey::Address(*address) - } else { - AddressOrStateKey::StateKey(state_key_h256) - }; - - let account_output = account_trie_record_to_output(state, account)?; - accounts.insert(addr_or_state_key, account_output); - } - - Ok(GenerationOutputs { accounts }) -} - -fn account_trie_record_to_output( - state: &GenerationState, - account: AccountTrieRecord, -) -> Result { - let storage = get_storage(state, account.storage_ptr)?; - - // TODO: This won't work if the 
account was created during the txn. - // Need to track changes to code, similar to how we track addresses - // with observe_new_address. - let code = state - .inputs - .contract_code - .get(&account.code_hash) - .ok_or(ProgramError::UnknownContractCode)? - .clone(); - - Ok(AccountOutput { - balance: account.balance, - nonce: account.nonce, - storage, - code, - }) -} - -/// Get an account's storage trie, given a pointer to its root. -fn get_storage( - state: &GenerationState, - storage_ptr: usize, -) -> Result, ProgramError> { - let storage_trie = read_trie::(&state.memory, storage_ptr, |x| { - Ok(read_storage_trie_value(x)) - })?; - - let mut map = HashMap::with_capacity(storage_trie.len()); - for (storage_key_nibbles, value) in storage_trie.into_iter() { - if storage_key_nibbles.count != 64 { - return Err(ProgramError::IntegerTooLarge); - }; - map.insert(storage_key_nibbles.try_into_u256().unwrap(), value); - } - - Ok(map) -} diff --git a/evm/src/generation/prover_input.rs b/evm/src/generation/prover_input.rs index 205dff7c66..9662d6b6b7 100644 --- a/evm/src/generation/prover_input.rs +++ b/evm/src/generation/prover_input.rs @@ -1,4 +1,5 @@ -use std::mem::transmute; +use core::mem::transmute; +use std::collections::{BTreeSet, HashMap}; use std::str::FromStr; use anyhow::{bail, Error}; @@ -8,17 +9,21 @@ use num_bigint::BigUint; use plonky2::field::types::Field; use serde::{Deserialize, Serialize}; +use crate::cpu::kernel::constants::context_metadata::ContextMetadata; use crate::extension_tower::{FieldExt, Fp12, BLS381, BN254}; use crate::generation::prover_input::EvmField::{ Bls381Base, Bls381Scalar, Bn254Base, Bn254Scalar, Secp256k1Base, Secp256k1Scalar, }; use crate::generation::prover_input::FieldOp::{Inverse, Sqrt}; +use crate::generation::simulate_cpu_between_labels_and_get_user_jumps; use crate::generation::state::GenerationState; use crate::memory::segments::Segment; use crate::memory::segments::Segment::BnPairing; -use crate::util::{biguint_to_mem_vec, 
mem_vec_to_biguint, u256_to_usize}; -use crate::witness::errors::ProgramError; +use crate::util::{biguint_to_mem_vec, mem_vec_to_biguint, u256_to_u8, u256_to_usize}; use crate::witness::errors::ProverInputError::*; +use crate::witness::errors::{ProgramError, ProverInputError}; +use crate::witness::memory::MemoryAddress; +use crate::witness::operation::CONTEXT_SCALING_FACTOR; use crate::witness::util::{current_context_peek, stack_peek}; /// Prover input function represented as a scoped function name. @@ -35,26 +40,33 @@ impl From> for ProverInputFn { impl GenerationState { pub(crate) fn prover_input(&mut self, input_fn: &ProverInputFn) -> Result { match input_fn.0[0].as_str() { - "end_of_txns" => self.run_end_of_txns(), + "no_txn" => self.no_txn(), + "trie_ptr" => self.run_trie_ptr(input_fn), "ff" => self.run_ff(input_fn), "sf" => self.run_sf(input_fn), "ffe" => self.run_ffe(input_fn), - "mpt" => self.run_mpt(), "rlp" => self.run_rlp(), "current_hash" => self.run_current_hash(), - "account_code" => self.run_account_code(input_fn), + "account_code" => self.run_account_code(), "bignum_modmul" => self.run_bignum_modmul(), + "withdrawal" => self.run_withdrawal(), + "num_bits" => self.run_num_bits(), + "jumpdest_table" => self.run_jumpdest_table(input_fn), _ => Err(ProgramError::ProverInputError(InvalidFunction)), } } - fn run_end_of_txns(&mut self) -> Result { - let end = self.next_txn_index == self.inputs.signed_txns.len(); - if end { - Ok(U256::one()) - } else { - self.next_txn_index += 1; - Ok(U256::zero()) + fn no_txn(&mut self) -> Result { + Ok(U256::from(self.inputs.signed_txn.is_none() as u8)) + } + + fn run_trie_ptr(&mut self, input_fn: &ProverInputFn) -> Result { + let trie = input_fn.0[1].as_str(); + match trie { + "state" => Ok(U256::from(self.trie_root_ptrs.state_root_ptr)), + "txn" => Ok(U256::from(self.trie_root_ptrs.txn_root_ptr)), + "receipt" => Ok(U256::from(self.trie_root_ptrs.receipt_root_ptr)), + _ => 
Err(ProgramError::ProverInputError(InvalidInput)), } } @@ -113,13 +125,6 @@ impl GenerationState { Ok(field.field_extension_inverse(n, f)) } - /// MPT data. - fn run_mpt(&mut self) -> Result { - self.mpt_prover_inputs - .pop() - .ok_or(ProgramError::ProverInputError(OutOfMptData)) - } - /// RLP data. fn run_rlp(&mut self) -> Result { self.rlp_prover_inputs @@ -131,35 +136,26 @@ impl GenerationState { Ok(U256::from_big_endian(&self.inputs.block_hashes.cur_hash.0)) } - /// Account code. - fn run_account_code(&mut self, input_fn: &ProverInputFn) -> Result { - match input_fn.0[1].as_str() { - "length" => { - // Return length of code. - // stack: codehash, ... - let codehash = stack_peek(self, 0)?; - Ok(self - .inputs - .contract_code - .get(&H256::from_uint(&codehash)) - .ok_or(ProgramError::ProverInputError(CodeHashNotFound))? - .len() - .into()) - } - "get" => { - // Return `code[i]`. - // stack: i, code_length, codehash, ... - let i = stack_peek(self, 0).map(u256_to_usize)??; - let codehash = stack_peek(self, 2)?; - Ok(self - .inputs - .contract_code - .get(&H256::from_uint(&codehash)) - .ok_or(ProgramError::ProverInputError(CodeHashNotFound))?[i] - .into()) - } - _ => Err(ProgramError::ProverInputError(InvalidInput)), + /// Account code loading. + /// Initializes the code segment of the given context with the code corresponding + /// to the provided hash. + /// Returns the length of the code. + fn run_account_code(&mut self) -> Result { + // stack: codehash, ctx, ... + let codehash = stack_peek(self, 0)?; + let context = stack_peek(self, 1)? >> CONTEXT_SCALING_FACTOR; + let context = u256_to_usize(context)?; + let mut address = MemoryAddress::new(context, Segment::Code, 0); + let code = self + .inputs + .contract_code + .get(&H256::from_uint(&codehash)) + .ok_or(ProgramError::ProverInputError(CodeHashNotFound))?; + for &byte in code { + self.memory.set(address, byte.into()); + address.increment(); } + Ok(code.len().into()) } // Bignum modular multiplication. 
@@ -168,10 +164,10 @@ impl GenerationState { // Subsequent calls return one limb at a time, in order (first remainder and then quotient). fn run_bignum_modmul(&mut self) -> Result { if self.bignum_modmul_result_limbs.is_empty() { - let len = stack_peek(self, 1).map(u256_to_usize)??; - let a_start_loc = stack_peek(self, 2).map(u256_to_usize)??; - let b_start_loc = stack_peek(self, 3).map(u256_to_usize)??; - let m_start_loc = stack_peek(self, 4).map(u256_to_usize)??; + let len = stack_peek(self, 2).map(u256_to_usize)??; + let a_start_loc = stack_peek(self, 3).map(u256_to_usize)??; + let b_start_loc = stack_peek(self, 4).map(u256_to_usize)??; + let m_start_loc = stack_peek(self, 5).map(u256_to_usize)??; let (remainder, quotient) = self.bignum_modmul(len, a_start_loc, b_start_loc, m_start_loc); @@ -198,11 +194,11 @@ impl GenerationState { m_start_loc: usize, ) -> (Vec, Vec) { let n = self.memory.contexts.len(); - let a = &self.memory.contexts[n - 1].segments[Segment::KernelGeneral as usize].content + let a = &self.memory.contexts[n - 1].segments[Segment::KernelGeneral.unscale()].content [a_start_loc..a_start_loc + len]; - let b = &self.memory.contexts[n - 1].segments[Segment::KernelGeneral as usize].content + let b = &self.memory.contexts[n - 1].segments[Segment::KernelGeneral.unscale()].content [b_start_loc..b_start_loc + len]; - let m = &self.memory.contexts[n - 1].segments[Segment::KernelGeneral as usize].content + let m = &self.memory.contexts[n - 1].segments[Segment::KernelGeneral.unscale()].content [m_start_loc..m_start_loc + len]; let a_biguint = mem_vec_to_biguint(a); @@ -219,6 +215,234 @@ impl GenerationState { (biguint_to_mem_vec(rem), biguint_to_mem_vec(quo)) } + + /// Withdrawal data. + fn run_withdrawal(&mut self) -> Result { + self.withdrawal_prover_inputs + .pop() + .ok_or(ProgramError::ProverInputError(OutOfWithdrawalData)) + } + + /// Return the number of bits of the top of the stack or an error if + /// the top of the stack is zero or empty. 
+ fn run_num_bits(&mut self) -> Result { + let value = stack_peek(self, 0)?; + if value.is_zero() { + Err(ProgramError::ProverInputError(NumBitsError)) + } else { + let num_bits = value.bits(); + Ok(num_bits.into()) + } + } + + /// Generate either the next used jump address or the proof for the last jump address. + fn run_jumpdest_table(&mut self, input_fn: &ProverInputFn) -> Result { + match input_fn.0[1].as_str() { + "next_address" => self.run_next_jumpdest_table_address(), + "next_proof" => self.run_next_jumpdest_table_proof(), + _ => Err(ProgramError::ProverInputError(InvalidInput)), + } + } + + /// Returns the next used jump address. + fn run_next_jumpdest_table_address(&mut self) -> Result { + let context = u256_to_usize(stack_peek(self, 0)? >> CONTEXT_SCALING_FACTOR)?; + + if self.jumpdest_table.is_none() { + self.generate_jumpdest_table()?; + } + + let Some(jumpdest_table) = &mut self.jumpdest_table else { + return Err(ProgramError::ProverInputError( + ProverInputError::InvalidJumpdestSimulation, + )); + }; + + if let Some(ctx_jumpdest_table) = jumpdest_table.get_mut(&context) + && let Some(next_jumpdest_address) = ctx_jumpdest_table.pop() + { + Ok((next_jumpdest_address + 1).into()) + } else { + self.jumpdest_table = None; + Ok(U256::zero()) + } + } + + /// Returns the proof for the last jump address. + fn run_next_jumpdest_table_proof(&mut self) -> Result { + let context = u256_to_usize(stack_peek(self, 1)? 
>> CONTEXT_SCALING_FACTOR)?; + let Some(jumpdest_table) = &mut self.jumpdest_table else { + return Err(ProgramError::ProverInputError( + ProverInputError::InvalidJumpdestSimulation, + )); + }; + if let Some(ctx_jumpdest_table) = jumpdest_table.get_mut(&context) + && let Some(next_jumpdest_proof) = ctx_jumpdest_table.pop() + { + Ok(next_jumpdest_proof.into()) + } else { + Err(ProgramError::ProverInputError( + ProverInputError::InvalidJumpdestSimulation, + )) + } + } +} + +impl GenerationState { + /// Simulate the user's code and store all the jump addresses with their respective contexts. + fn generate_jumpdest_table(&mut self) -> Result<(), ProgramError> { + let checkpoint = self.checkpoint(); + let memory = self.memory.clone(); + + // Simulate the user's code and (unnecessarily) part of the kernel code, skipping the validate table call + let Some(jumpdest_table) = simulate_cpu_between_labels_and_get_user_jumps( + "jumpdest_analysis_end", + "terminate_common", + self, + ) else { + self.jumpdest_table = Some(HashMap::new()); + return Ok(()); + }; + + // Return to the state before starting the simulation + self.rollback(checkpoint); + self.memory = memory; + + // Find proofs for all contexts + self.set_jumpdest_analysis_inputs(jumpdest_table); + + Ok(()) + } + + /// Given a HashMap containing the contexts and the jumpdest addresses, compute their respective proofs, + /// by calling `get_proofs_and_jumpdests` + pub(crate) fn set_jumpdest_analysis_inputs( + &mut self, + jumpdest_table: HashMap>, + ) { + self.jumpdest_table = Some(HashMap::from_iter(jumpdest_table.into_iter().map( + |(ctx, jumpdest_table)| { + let code = self.get_code(ctx).unwrap(); + if let Some(&largest_address) = jumpdest_table.last() { + let proofs = get_proofs_and_jumpdests(&code, largest_address, jumpdest_table); + (ctx, proofs) + } else { + (ctx, vec![]) + } + }, + ))); + } + + fn get_code(&self, context: usize) -> Result, ProgramError> { + let code_len = self.get_code_len(context)?; + let code = 
(0..code_len) + .map(|i| { + u256_to_u8( + self.memory + .get(MemoryAddress::new(context, Segment::Code, i)), + ) + }) + .collect::, _>>()?; + Ok(code) + } + + fn get_code_len(&self, context: usize) -> Result { + let code_len = u256_to_usize(self.memory.get(MemoryAddress::new( + context, + Segment::ContextMetadata, + ContextMetadata::CodeSize.unscale(), + )))?; + Ok(code_len) + } +} + +/// For all address in `jumpdest_table`, each bounded by `largest_address`, +/// this function searches for a proof. A proof is the closest address +/// for which none of the previous 32 bytes in the code (including opcodes +/// and pushed bytes) are PUSHXX and the address is in its range. It returns +/// a vector of even size containing proofs followed by their addresses. +fn get_proofs_and_jumpdests( + code: &[u8], + largest_address: usize, + jumpdest_table: std::collections::BTreeSet, +) -> Vec { + const PUSH1_OPCODE: u8 = 0x60; + const PUSH32_OPCODE: u8 = 0x7f; + let (proofs, _) = CodeIterator::until(code, largest_address + 1).fold( + (vec![], 0), + |(mut proofs, acc), (pos, _opcode)| { + let has_prefix = if let Some(prefix_start) = pos.checked_sub(32) { + code[prefix_start..pos] + .iter() + .enumerate() + .fold(true, |acc, (prefix_pos, &byte)| { + let cond1 = byte > PUSH32_OPCODE; + let cond2 = (prefix_start + prefix_pos) as i32 + + (byte as i32 - PUSH1_OPCODE as i32) + + 1 + < pos as i32; + acc && (cond1 || cond2) + }) + } else { + false + }; + let acc = if has_prefix { pos - 32 } else { acc }; + if jumpdest_table.contains(&pos) { + // Push the proof + proofs.push(acc); + // Push the address + proofs.push(pos); + } + (proofs, acc) + }, + ); + proofs +} + +/// An iterator over the EVM code contained in `code`, which skips the bytes +/// that are the arguments of a PUSHXX opcode. 
+struct CodeIterator<'a> { + code: &'a [u8], + pos: usize, + end: usize, +} + +impl<'a> CodeIterator<'a> { + fn new(code: &'a [u8]) -> Self { + CodeIterator { + end: code.len(), + code, + pos: 0, + } + } + fn until(code: &'a [u8], end: usize) -> Self { + CodeIterator { + end: std::cmp::min(code.len(), end), + code, + pos: 0, + } + } +} + +impl<'a> Iterator for CodeIterator<'a> { + type Item = (usize, u8); + + fn next(&mut self) -> Option { + const PUSH1_OPCODE: u8 = 0x60; + const PUSH32_OPCODE: u8 = 0x7f; + let CodeIterator { code, pos, end } = self; + if *pos >= *end { + return None; + } + let opcode = code[*pos]; + let old_pos = *pos; + *pos += if (PUSH1_OPCODE..=PUSH32_OPCODE).contains(&opcode) { + (opcode - PUSH1_OPCODE + 2).into() + } else { + 1 + }; + Some((old_pos, opcode)) + } } enum EvmField { diff --git a/evm/src/generation/rlp.rs b/evm/src/generation/rlp.rs index f28272a27b..ffc302fd54 100644 --- a/evm/src/generation/rlp.rs +++ b/evm/src/generation/rlp.rs @@ -1,18 +1,22 @@ use ethereum_types::U256; -pub(crate) fn all_rlp_prover_inputs_reversed(signed_txns: &[Vec]) -> Vec { - let mut inputs = all_rlp_prover_inputs(signed_txns); +pub(crate) fn all_rlp_prover_inputs_reversed(signed_txn: &[u8]) -> Vec { + let mut inputs = all_rlp_prover_inputs(signed_txn); inputs.reverse(); inputs } -fn all_rlp_prover_inputs(signed_txns: &[Vec]) -> Vec { +fn all_rlp_prover_inputs(signed_txn: &[u8]) -> Vec { let mut prover_inputs = vec![]; - for txn in signed_txns { - prover_inputs.push(txn.len().into()); - for &byte in txn { - prover_inputs.push(byte.into()); - } + prover_inputs.push(signed_txn.len().into()); + let mut chunks = signed_txn.chunks_exact(32); + for bytes in chunks.by_ref() { + prover_inputs.push(U256::from_big_endian(bytes)); + } + let mut last_chunk = chunks.remainder().to_vec(); + if !last_chunk.is_empty() { + last_chunk.extend_from_slice(&vec![0u8; 32 - last_chunk.len()]); + prover_inputs.push(U256::from_big_endian(&last_chunk)); } prover_inputs } diff --git 
a/evm/src/generation/state.rs b/evm/src/generation/state.rs index aec01e1b71..a6df4b3331 100644 --- a/evm/src/generation/state.rs +++ b/evm/src/generation/state.rs @@ -4,9 +4,10 @@ use ethereum_types::{Address, BigEndianHash, H160, H256, U256}; use keccak_hash::keccak; use plonky2::field::types::Field; +use super::mpt::{load_all_mpts, TrieRootPtrs}; +use super::TrieInputs; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::context_metadata::ContextMetadata; -use crate::generation::mpt::all_mpt_prover_inputs_reversed; use crate::generation::rlp::all_rlp_prover_inputs_reversed; use crate::generation::GenerationInputs; use crate::memory::segments::Segment; @@ -29,16 +30,12 @@ pub(crate) struct GenerationState { pub(crate) memory: MemoryState, pub(crate) traces: Traces, - pub(crate) next_txn_index: usize, - - /// Prover inputs containing MPT data, in reverse order so that the next input can be obtained - /// via `pop()`. - pub(crate) mpt_prover_inputs: Vec, - /// Prover inputs containing RLP data, in reverse order so that the next input can be obtained /// via `pop()`. pub(crate) rlp_prover_inputs: Vec, + pub(crate) withdrawal_prover_inputs: Vec, + /// The state trie only stores state keys, which are hashes of addresses, but sometimes it is /// useful to see the actual addresses for debugging. Here we store the mapping for all known /// addresses. @@ -48,11 +45,28 @@ pub(crate) struct GenerationState { /// inputs are obtained in big-endian order via `pop()`). Contains both the remainder and the /// quotient, in that order. pub(crate) bignum_modmul_result_limbs: Vec, + + /// Pointers, within the `TrieData` segment, of the three MPTs. + pub(crate) trie_root_ptrs: TrieRootPtrs, + + /// A hash map where the key is a context in the user's code and the value is the set of + /// jump destinations with its corresponding "proof". 
A "proof" for a jump destination is + /// either 0 or an address i > 32 in the code (not necessarily pointing to an opcode) such that + /// for every j in [i, i+32] it holds that code[j] < 0x7f - j + i. + pub(crate) jumpdest_table: Option>>, } impl GenerationState { + fn preinitialize_mpts(&mut self, trie_inputs: &TrieInputs) -> TrieRootPtrs { + let (trie_roots_ptrs, trie_data) = + load_all_mpts(trie_inputs).expect("Invalid MPT data for preinitialization"); + + self.memory.contexts[0].segments[Segment::TrieData.unscale()].content = trie_data; + + trie_roots_ptrs + } pub(crate) fn new(inputs: GenerationInputs, kernel_code: &[u8]) -> Result { - log::debug!("Input signed_txns: {:?}", &inputs.signed_txns); + log::debug!("Input signed_txn: {:?}", &inputs.signed_txn); log::debug!("Input state_trie: {:?}", &inputs.tries.state_trie); log::debug!( "Input transactions_trie: {:?}", @@ -61,26 +75,37 @@ impl GenerationState { log::debug!("Input receipts_trie: {:?}", &inputs.tries.receipts_trie); log::debug!("Input storage_tries: {:?}", &inputs.tries.storage_tries); log::debug!("Input contract_code: {:?}", &inputs.contract_code); - let mpt_prover_inputs = all_mpt_prover_inputs_reversed(&inputs.tries)?; - let rlp_prover_inputs = all_rlp_prover_inputs_reversed(&inputs.signed_txns); + + let rlp_prover_inputs = + all_rlp_prover_inputs_reversed(inputs.clone().signed_txn.as_ref().unwrap_or(&vec![])); + let withdrawal_prover_inputs = all_withdrawals_prover_inputs_reversed(&inputs.withdrawals); let bignum_modmul_result_limbs = Vec::new(); - Ok(Self { - inputs, + let mut state = Self { + inputs: inputs.clone(), registers: Default::default(), memory: MemoryState::new(kernel_code), traces: Traces::default(), - next_txn_index: 0, - mpt_prover_inputs, rlp_prover_inputs, + withdrawal_prover_inputs, state_key_to_address: HashMap::new(), bignum_modmul_result_limbs, - }) + trie_root_ptrs: TrieRootPtrs { + state_root_ptr: 0, + txn_root_ptr: 0, + receipt_root_ptr: 0, + }, + jumpdest_table: None, + 
}; + let trie_root_ptrs = state.preinitialize_mpts(&inputs.tries); + + state.trie_root_ptrs = trie_root_ptrs; + Ok(state) } /// Updates `program_counter`, and potentially adds some extra handling if we're jumping to a /// special location. - pub fn jump_to(&mut self, dst: usize) -> Result<(), ProgramError> { + pub(crate) fn jump_to(&mut self, dst: usize) -> Result<(), ProgramError> { self.registers.program_counter = dst; if dst == KERNEL.global_labels["observe_new_address"] { let tip_u256 = stack_peek(self, 0)?; @@ -98,26 +123,24 @@ impl GenerationState { /// Observe the given address, so that we will be able to recognize the associated state key. /// This is just for debugging purposes. - pub fn observe_address(&mut self, address: Address) { + pub(crate) fn observe_address(&mut self, address: Address) { let state_key = keccak(address.0); self.state_key_to_address.insert(state_key, address); } /// Observe the given code hash and store the associated code. /// When called, the code corresponding to `codehash` should be stored in the return data. - pub fn observe_contract(&mut self, codehash: H256) -> Result<(), ProgramError> { + pub(crate) fn observe_contract(&mut self, codehash: H256) -> Result<(), ProgramError> { if self.inputs.contract_code.contains_key(&codehash) { return Ok(()); // Return early if the code hash has already been observed. 
} let ctx = self.registers.context; - let returndata_size_addr = MemoryAddress::new( - ctx, - Segment::ContextMetadata, - ContextMetadata::ReturndataSize as usize, - ); + let returndata_offset = ContextMetadata::ReturndataSize.unscale(); + let returndata_size_addr = + MemoryAddress::new(ctx, Segment::ContextMetadata, returndata_offset); let returndata_size = u256_to_usize(self.memory.get(returndata_size_addr))?; - let code = self.memory.contexts[ctx].segments[Segment::Returndata as usize].content + let code = self.memory.contexts[ctx].segments[Segment::Returndata.unscale()].content [..returndata_size] .iter() .map(|x| x.low_u32() as u8) @@ -129,14 +152,14 @@ impl GenerationState { Ok(()) } - pub fn checkpoint(&self) -> GenerationStateCheckpoint { + pub(crate) fn checkpoint(&self) -> GenerationStateCheckpoint { GenerationStateCheckpoint { registers: self.registers, traces: self.traces.checkpoint(), } } - pub fn rollback(&mut self, checkpoint: GenerationStateCheckpoint) { + pub(crate) fn rollback(&mut self, checkpoint: GenerationStateCheckpoint) { self.registers = checkpoint.registers; self.traces.rollback(checkpoint.traces); } @@ -147,4 +170,37 @@ impl GenerationState { .map(|i| stack_peek(self, i).unwrap()) .collect() } + + /// Clones everything but the traces. + pub(crate) fn soft_clone(&self) -> GenerationState { + Self { + inputs: self.inputs.clone(), + registers: self.registers, + memory: self.memory.clone(), + traces: Traces::default(), + rlp_prover_inputs: self.rlp_prover_inputs.clone(), + state_key_to_address: self.state_key_to_address.clone(), + bignum_modmul_result_limbs: self.bignum_modmul_result_limbs.clone(), + withdrawal_prover_inputs: self.withdrawal_prover_inputs.clone(), + trie_root_ptrs: TrieRootPtrs { + state_root_ptr: 0, + txn_root_ptr: 0, + receipt_root_ptr: 0, + }, + jumpdest_table: None, + } + } +} + +/// Withdrawals prover input array is of the form `[addr0, amount0, ..., addrN, amountN, U256::MAX, U256::MAX]`. 
+/// Returns the reversed array. +pub(crate) fn all_withdrawals_prover_inputs_reversed(withdrawals: &[(Address, U256)]) -> Vec { + let mut withdrawal_prover_inputs = withdrawals + .iter() + .flat_map(|w| [U256::from((w.0).0.as_slice()), w.1]) + .collect::>(); + withdrawal_prover_inputs.push(U256::MAX); + withdrawal_prover_inputs.push(U256::MAX); + withdrawal_prover_inputs.reverse(); + withdrawal_prover_inputs } diff --git a/evm/src/generation/trie_extractor.rs b/evm/src/generation/trie_extractor.rs index 42c50c6d75..4d3a745a19 100644 --- a/evm/src/generation/trie_extractor.rs +++ b/evm/src/generation/trie_extractor.rs @@ -3,11 +3,13 @@ use std::collections::HashMap; use eth_trie_utils::nibbles::Nibbles; +use eth_trie_utils::partial_trie::{HashedPartialTrie, Node, PartialTrie, WrappedNode}; use ethereum_types::{BigEndianHash, H256, U256, U512}; +use super::mpt::{AccountRlp, LegacyReceiptRlp, LogRlp}; use crate::cpu::kernel::constants::trie_type::PartialTrieType; use crate::memory::segments::Segment; -use crate::util::u256_to_usize; +use crate::util::{u256_to_bool, u256_to_h160, u256_to_u8, u256_to_usize}; use crate::witness::errors::ProgramError; use crate::witness::memory::{MemoryAddress, MemoryState}; @@ -29,7 +31,7 @@ pub(crate) fn read_state_trie_value(slice: &[U256]) -> Result U256 { +pub(crate) const fn read_storage_trie_value(slice: &[U256]) -> U256 { slice[0] } @@ -56,7 +58,7 @@ pub(crate) fn read_trie_helper( ) -> Result<(), ProgramError> { let load = |offset| memory.get(MemoryAddress::new(0, Segment::TrieData, offset)); let load_slice_from = |init_offset| { - &memory.contexts[0].segments[Segment::TrieData as usize].content[init_offset..] + &memory.contexts[0].segments[Segment::TrieData.unscale()].content[init_offset..] 
}; let trie_type = PartialTrieType::all()[u256_to_usize(load(ptr))?]; @@ -109,3 +111,203 @@ pub(crate) fn read_trie_helper( } } } + +pub(crate) fn read_receipt_trie_value( + slice: &[U256], +) -> Result<(Option, LegacyReceiptRlp), ProgramError> { + let first_value = slice[0]; + // Skip two elements for non-legacy Receipts, and only one otherwise. + let (first_byte, slice) = if first_value == U256::one() || first_value == U256::from(2u8) { + (Some(first_value.as_u32() as u8), &slice[2..]) + } else { + (None, &slice[1..]) + }; + + let status = u256_to_bool(slice[0])?; + let cum_gas_used = slice[1]; + let bloom = slice[2..2 + 256] + .iter() + .map(|&x| u256_to_u8(x)) + .collect::>()?; + // We read the number of logs at position `2 + 256 + 1`, and skip over the next element before parsing the logs. + let logs = read_logs(u256_to_usize(slice[2 + 256 + 1])?, &slice[2 + 256 + 3..])?; + + Ok(( + first_byte, + LegacyReceiptRlp { + status, + cum_gas_used, + bloom, + logs, + }, + )) +} + +pub(crate) fn read_logs(num_logs: usize, slice: &[U256]) -> Result, ProgramError> { + let mut offset = 0; + (0..num_logs) + .map(|_| { + let address = u256_to_h160(slice[offset])?; + let num_topics = u256_to_usize(slice[offset + 1])?; + + let topics = (0..num_topics) + .map(|i| H256::from_uint(&slice[offset + 2 + i])) + .collect(); + + let data_len = u256_to_usize(slice[offset + 2 + num_topics])?; + let log = LogRlp { + address, + topics, + data: slice[offset + 2 + num_topics + 1..offset + 2 + num_topics + 1 + data_len] + .iter() + .map(|&x| u256_to_u8(x)) + .collect::>()?, + }; + offset += 2 + num_topics + 1 + data_len; + Ok(log) + }) + .collect() +} + +pub(crate) fn read_state_rlp_value( + memory: &MemoryState, + slice: &[U256], +) -> Result, ProgramError> { + let storage_trie: HashedPartialTrie = get_trie(memory, slice[2].as_usize(), |_, x| { + Ok(rlp::encode(&read_storage_trie_value(x)).to_vec()) + })?; + let account = AccountRlp { + nonce: slice[0], + balance: slice[1], + storage_root: 
storage_trie.hash(), + code_hash: H256::from_uint(&slice[3]), + }; + Ok(rlp::encode(&account).to_vec()) +} + +pub(crate) fn read_txn_rlp_value( + _memory: &MemoryState, + slice: &[U256], +) -> Result, ProgramError> { + let txn_rlp_len = u256_to_usize(slice[0])?; + slice[1..txn_rlp_len + 1] + .iter() + .map(|&x| u256_to_u8(x)) + .collect::>() +} + +pub(crate) fn read_receipt_rlp_value( + _memory: &MemoryState, + slice: &[U256], +) -> Result, ProgramError> { + let (first_byte, receipt) = read_receipt_trie_value(slice)?; + let mut bytes = rlp::encode(&receipt).to_vec(); + if let Some(txn_byte) = first_byte { + bytes.insert(0, txn_byte); + } + + Ok(bytes) +} + +pub(crate) fn get_state_trie( + memory: &MemoryState, + ptr: usize, +) -> Result { + get_trie(memory, ptr, read_state_rlp_value) +} + +pub(crate) fn get_txn_trie( + memory: &MemoryState, + ptr: usize, +) -> Result { + get_trie(memory, ptr, read_txn_rlp_value) +} + +pub(crate) fn get_receipt_trie( + memory: &MemoryState, + ptr: usize, +) -> Result { + get_trie(memory, ptr, read_receipt_rlp_value) +} + +pub(crate) fn get_trie( + memory: &MemoryState, + ptr: usize, + read_rlp_value: fn(&MemoryState, &[U256]) -> Result, ProgramError>, +) -> Result { + let empty_nibbles = Nibbles { + count: 0, + packed: U512::zero(), + }; + Ok(N::new(get_trie_helper( + memory, + ptr, + read_rlp_value, + empty_nibbles, + )?)) +} + +pub(crate) fn get_trie_helper( + memory: &MemoryState, + ptr: usize, + read_value: fn(&MemoryState, &[U256]) -> Result, ProgramError>, + prefix: Nibbles, +) -> Result, ProgramError> { + let load = |offset| memory.get(MemoryAddress::new(0, Segment::TrieData, offset)); + let load_slice_from = |init_offset| { + &memory.contexts[0].segments[Segment::TrieData.unscale()].content[init_offset..] 
+ }; + + let trie_type = PartialTrieType::all()[u256_to_usize(load(ptr))?]; + match trie_type { + PartialTrieType::Empty => Ok(Node::Empty), + PartialTrieType::Hash => { + let ptr_payload = ptr + 1; + let hash = H256::from_uint(&load(ptr_payload)); + Ok(Node::Hash(hash)) + } + PartialTrieType::Branch => { + let ptr_payload = ptr + 1; + let children = (0..16) + .map(|i| { + let child_ptr = u256_to_usize(load(ptr_payload + i as usize))?; + get_trie_helper(memory, child_ptr, read_value, prefix.merge_nibble(i as u8)) + }) + .collect::, _>>()?; + let children = core::array::from_fn(|i| WrappedNode::from(children[i].clone())); + let value_ptr = u256_to_usize(load(ptr_payload + 16))?; + let mut value: Vec = vec![]; + if value_ptr != 0 { + value = read_value(memory, load_slice_from(value_ptr))?; + }; + Ok(Node::Branch { children, value }) + } + PartialTrieType::Extension => { + let count = u256_to_usize(load(ptr + 1))?; + let packed = load(ptr + 2); + let nibbles = Nibbles { + count, + packed: packed.into(), + }; + let child_ptr = u256_to_usize(load(ptr + 3))?; + let child = WrappedNode::from(get_trie_helper( + memory, + child_ptr, + read_value, + prefix.merge_nibbles(&nibbles), + )?); + Ok(Node::Extension { nibbles, child }) + } + PartialTrieType::Leaf => { + let count = u256_to_usize(load(ptr + 1))?; + let packed = load(ptr + 2); + let nibbles = Nibbles { + count, + packed: packed.into(), + }; + let value_ptr = u256_to_usize(load(ptr + 3))?; + let value = read_value(memory, load_slice_from(value_ptr))?; + Ok(Node::Leaf { nibbles, value }) + } + } +} diff --git a/evm/src/get_challenges.rs b/evm/src/get_challenges.rs index e9e5de9360..756b0650da 100644 --- a/evm/src/get_challenges.rs +++ b/evm/src/get_challenges.rs @@ -61,16 +61,12 @@ fn observe_block_metadata< challenger.observe_element(u256_to_u32(block_metadata.block_number)?); challenger.observe_element(u256_to_u32(block_metadata.block_difficulty)?); 
challenger.observe_elements(&h256_limbs::(block_metadata.block_random)); - let gaslimit = u256_to_u64(block_metadata.block_gaslimit)?; - challenger.observe_element(gaslimit.0); - challenger.observe_element(gaslimit.1); + challenger.observe_element(u256_to_u32(block_metadata.block_gaslimit)?); challenger.observe_element(u256_to_u32(block_metadata.block_chain_id)?); let basefee = u256_to_u64(block_metadata.block_base_fee)?; challenger.observe_element(basefee.0); challenger.observe_element(basefee.1); - let gas_used = u256_to_u64(block_metadata.block_gas_used)?; - challenger.observe_element(gas_used.0); - challenger.observe_element(gas_used.1); + challenger.observe_element(u256_to_u32(block_metadata.block_gas_used)?); for i in 0..8 { challenger.observe_elements(&u256_limbs(block_metadata.block_bloom[i])); } @@ -93,10 +89,10 @@ fn observe_block_metadata_target< challenger.observe_element(block_metadata.block_number); challenger.observe_element(block_metadata.block_difficulty); challenger.observe_elements(&block_metadata.block_random); - challenger.observe_elements(&block_metadata.block_gaslimit); + challenger.observe_element(block_metadata.block_gaslimit); challenger.observe_element(block_metadata.block_chain_id); challenger.observe_elements(&block_metadata.block_base_fee); - challenger.observe_elements(&block_metadata.block_gas_used); + challenger.observe_element(block_metadata.block_gas_used); challenger.observe_elements(&block_metadata.block_bloom); } @@ -108,21 +104,11 @@ fn observe_extra_block_data< challenger: &mut Challenger, extra_data: &ExtraBlockData, ) -> Result<(), ProgramError> { - challenger.observe_elements(&h256_limbs(extra_data.genesis_state_trie_root)); + challenger.observe_elements(&h256_limbs(extra_data.checkpoint_state_trie_root)); challenger.observe_element(u256_to_u32(extra_data.txn_number_before)?); challenger.observe_element(u256_to_u32(extra_data.txn_number_after)?); - let gas_used_before = u256_to_u64(extra_data.gas_used_before)?; - 
challenger.observe_element(gas_used_before.0); - challenger.observe_element(gas_used_before.1); - let gas_used_after = u256_to_u64(extra_data.gas_used_after)?; - challenger.observe_element(gas_used_after.0); - challenger.observe_element(gas_used_after.1); - for i in 0..8 { - challenger.observe_elements(&u256_limbs(extra_data.block_bloom_before[i])); - } - for i in 0..8 { - challenger.observe_elements(&u256_limbs(extra_data.block_bloom_after[i])); - } + challenger.observe_element(u256_to_u32(extra_data.gas_used_before)?); + challenger.observe_element(u256_to_u32(extra_data.gas_used_after)?); Ok(()) } @@ -137,13 +123,11 @@ fn observe_extra_block_data_target< ) where C::Hasher: AlgebraicHasher, { - challenger.observe_elements(&extra_data.genesis_state_trie_root); + challenger.observe_elements(&extra_data.checkpoint_state_trie_root); challenger.observe_element(extra_data.txn_number_before); challenger.observe_element(extra_data.txn_number_after); - challenger.observe_elements(&extra_data.gas_used_before); - challenger.observe_elements(&extra_data.gas_used_after); - challenger.observe_elements(&extra_data.block_bloom_before); - challenger.observe_elements(&extra_data.block_bloom_after); + challenger.observe_element(extra_data.gas_used_before); + challenger.observe_element(extra_data.gas_used_after); } fn observe_block_hashes< diff --git a/evm/src/keccak/columns.rs b/evm/src/keccak/columns.rs index d9a71af4ff..eedba41c0f 100644 --- a/evm/src/keccak/columns.rs +++ b/evm/src/keccak/columns.rs @@ -1,10 +1,10 @@ use plonky2::field::types::Field; -use crate::cross_table_lookup::Column; use crate::keccak::keccak_stark::{NUM_INPUTS, NUM_ROUNDS}; +use crate::lookup::Column; /// A register which is set to 1 if we are in the `i`th round, otherwise 0. -pub const fn reg_step(i: usize) -> usize { +pub(crate) const fn reg_step(i: usize) -> usize { debug_assert!(i < NUM_ROUNDS); i } @@ -12,7 +12,7 @@ pub const fn reg_step(i: usize) -> usize { /// Registers to hold permutation inputs. 
/// `reg_input_limb(2*i) -> input[i] as u32` /// `reg_input_limb(2*i+1) -> input[i] >> 32` -pub fn reg_input_limb(i: usize) -> Column { +pub(crate) fn reg_input_limb(i: usize) -> Column { debug_assert!(i < 2 * NUM_INPUTS); let i_u64 = i / 2; // The index of the 64-bit chunk. @@ -28,7 +28,7 @@ pub fn reg_input_limb(i: usize) -> Column { /// Registers to hold permutation outputs. /// `reg_output_limb(2*i) -> output[i] as u32` /// `reg_output_limb(2*i+1) -> output[i] >> 32` -pub const fn reg_output_limb(i: usize) -> usize { +pub(crate) const fn reg_output_limb(i: usize) -> usize { debug_assert!(i < 2 * NUM_INPUTS); let i_u64 = i / 2; // The index of the 64-bit chunk. diff --git a/evm/src/keccak/keccak_stark.rs b/evm/src/keccak/keccak_stark.rs index 2745d03302..771c9b4371 100644 --- a/evm/src/keccak/keccak_stark.rs +++ b/evm/src/keccak/keccak_stark.rs @@ -1,4 +1,4 @@ -use std::marker::PhantomData; +use core::marker::PhantomData; use itertools::Itertools; use plonky2::field::extension::{Extendable, FieldExtension}; @@ -13,7 +13,6 @@ use plonky2::util::timing::TimingTree; use super::columns::reg_input_limb; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::cross_table_lookup::Column; use crate::evaluation_frame::{StarkEvaluationFrame, StarkFrame}; use crate::keccak::columns::{ reg_a, reg_a_prime, reg_a_prime_prime, reg_a_prime_prime_0_0_bit, reg_a_prime_prime_prime, @@ -24,6 +23,7 @@ use crate::keccak::logic::{ andn, andn_gen, andn_gen_circuit, xor, xor3_gen, xor3_gen_circuit, xor_gen, xor_gen_circuit, }; use crate::keccak::round_flags::{eval_round_flags, eval_round_flags_recursively}; +use crate::lookup::{Column, Filter}; use crate::stark::Stark; use crate::util::trace_rows_to_poly_values; @@ -33,28 +33,32 @@ pub(crate) const NUM_ROUNDS: usize = 24; /// Number of 64-bit elements in the Keccak permutation input. 
pub(crate) const NUM_INPUTS: usize = 25; -pub fn ctl_data_inputs() -> Vec> { +/// Create vector of `Columns` corresponding to the permutation input limbs. +pub(crate) fn ctl_data_inputs() -> Vec> { let mut res: Vec<_> = (0..2 * NUM_INPUTS).map(reg_input_limb).collect(); res.push(Column::single(TIMESTAMP)); res } -pub fn ctl_data_outputs() -> Vec> { +/// Create vector of `Columns` corresponding to the permutation output limbs. +pub(crate) fn ctl_data_outputs() -> Vec> { let mut res: Vec<_> = Column::singles((0..2 * NUM_INPUTS).map(reg_output_limb)).collect(); res.push(Column::single(TIMESTAMP)); res } -pub fn ctl_filter_inputs() -> Column { - Column::single(reg_step(0)) +/// CTL filter for the first round of the Keccak permutation. +pub(crate) fn ctl_filter_inputs() -> Filter { + Filter::new_simple(Column::single(reg_step(0))) } -pub fn ctl_filter_outputs() -> Column { - Column::single(reg_step(NUM_ROUNDS - 1)) +/// CTL filter for the final round of the Keccak permutation. +pub(crate) fn ctl_filter_outputs() -> Filter { + Filter::new_simple(Column::single(reg_step(NUM_ROUNDS - 1))) } #[derive(Copy, Clone, Default)] -pub struct KeccakStark { +pub(crate) struct KeccakStark { pub(crate) f: PhantomData, } @@ -227,7 +231,7 @@ impl, const D: usize> KeccakStark { row[out_reg_hi] = F::from_canonical_u64(row[in_reg_hi].to_canonical_u64() ^ rc_hi); } - pub fn generate_trace( + pub(crate) fn generate_trace( &self, inputs: Vec<([u64; NUM_INPUTS], usize)>, min_rows: usize, @@ -618,21 +622,16 @@ impl, const D: usize> Stark for KeccakStark { @@ -21,17 +32,16 @@ pub(crate) struct KeccakSpongeColumnsView { /// not a padding byte; 0 otherwise. pub is_full_input_block: T, - // The base address at which we will read the input block. + /// The context of the base address at which we will read the input block. pub context: T, + /// The segment of the base address at which we will read the input block. pub segment: T, + /// The virtual address at which we will read the input block. 
pub virt: T, /// The timestamp at which inputs should be read from memory. pub timestamp: T, - /// The length of the original input, in bytes. - pub len: T, - /// The number of input bytes that have already been absorbed prior to this block. pub already_absorbed_bytes: T, @@ -63,10 +73,35 @@ pub(crate) struct KeccakSpongeColumnsView { /// The first part of the state of the sponge, seen as bytes, after the permutation is applied. /// This also represents the output digest of the Keccak sponge during the squeezing phase. pub updated_digest_state_bytes: [T; KECCAK_DIGEST_BYTES], + + /// The counter column (used for the range check) starts from 0 and increments. + pub range_counter: T, + /// The frequencies column used in logUp. + pub rc_frequencies: T, } // `u8` is guaranteed to have a `size_of` of 1. -pub const NUM_KECCAK_SPONGE_COLUMNS: usize = size_of::>(); +/// Number of columns in `KeccakSpongeStark`. +pub(crate) const NUM_KECCAK_SPONGE_COLUMNS: usize = size_of::>(); + +// Indices for LogUp range-check. +// They are on the last registers of this table. +pub(crate) const RC_FREQUENCIES: usize = NUM_KECCAK_SPONGE_COLUMNS - 1; +pub(crate) const RANGE_COUNTER: usize = RC_FREQUENCIES - 1; + +pub(crate) const BLOCK_BYTES_START: usize = + 6 + KECCAK_RATE_BYTES + KECCAK_RATE_U32S + KECCAK_CAPACITY_U32S; +/// Indices for the range-checked values, i.e. the `block_bytes` section. +// TODO: Find a better way to access those indices +pub(crate) const fn get_block_bytes_range() -> Range { + BLOCK_BYTES_START..BLOCK_BYTES_START + KECCAK_RATE_BYTES +} + +/// Return the index for the targeted `block_bytes` element. 
+pub(crate) const fn get_single_block_bytes_value(i: usize) -> usize { + debug_assert!(i < KECCAK_RATE_BYTES); + get_block_bytes_range().start + i +} impl From<[T; NUM_KECCAK_SPONGE_COLUMNS]> for KeccakSpongeColumnsView { fn from(value: [T; NUM_KECCAK_SPONGE_COLUMNS]) -> Self { @@ -117,4 +152,5 @@ const fn make_col_map() -> KeccakSpongeColumnsView { } } +/// Map between the `KeccakSponge` columns and (0..`NUM_KECCAK_SPONGE_COLUMNS`) pub(crate) const KECCAK_SPONGE_COL_MAP: KeccakSpongeColumnsView = make_col_map(); diff --git a/evm/src/keccak_sponge/keccak_sponge_stark.rs b/evm/src/keccak_sponge/keccak_sponge_stark.rs index e491252ba8..ddf2bca00e 100644 --- a/evm/src/keccak_sponge/keccak_sponge_stark.rs +++ b/evm/src/keccak_sponge/keccak_sponge_stark.rs @@ -1,7 +1,7 @@ -use std::borrow::Borrow; -use std::iter::{once, repeat}; -use std::marker::PhantomData; -use std::mem::size_of; +use core::borrow::Borrow; +use core::iter::{self, once, repeat}; +use core::marker::PhantomData; +use core::mem::size_of; use itertools::Itertools; use plonky2::field::extension::{Extendable, FieldExtension}; @@ -12,17 +12,25 @@ use plonky2::hash::hash_types::RichField; use plonky2::iop::ext_target::ExtensionTarget; use plonky2::timed; use plonky2::util::timing::TimingTree; +use plonky2::util::transpose; use plonky2_util::ceil_div_usize; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::cpu::kernel::keccak_util::keccakf_u32s; -use crate::cross_table_lookup::Column; use crate::evaluation_frame::{StarkEvaluationFrame, StarkFrame}; use crate::keccak_sponge::columns::*; +use crate::lookup::{Column, Filter, Lookup}; use crate::stark::Stark; -use crate::util::trace_rows_to_poly_values; use crate::witness::memory::MemoryAddress; +/// Strict upper bound for the individual bytes range-check. 
+const BYTE_RANGE_MAX: usize = 256; + +/// Creates the vector of `Columns` corresponding to: +/// - the address in memory of the inputs, +/// - the length of the inputs, +/// - the timestamp at which the inputs are read from memory, +/// - the output limbs of the Keccak sponge. pub(crate) fn ctl_looked_data() -> Vec> { let cols = KECCAK_SPONGE_COL_MAP; let mut outputs = Vec::with_capacity(8); @@ -36,17 +44,28 @@ pub(crate) fn ctl_looked_data() -> Vec> { outputs.push(cur_col); } - Column::singles([ - cols.context, - cols.segment, - cols.virt, - cols.len, - cols.timestamp, - ]) - .chain(outputs) - .collect() + // The length of the inputs is `already_absorbed_bytes + is_final_input_len`. + let len_col = Column::linear_combination( + iter::once((cols.already_absorbed_bytes, F::ONE)).chain( + cols.is_final_input_len + .iter() + .enumerate() + .map(|(i, &elt)| (elt, F::from_canonical_usize(i))), + ), + ); + + let mut res: Vec> = + Column::singles([cols.context, cols.segment, cols.virt]).collect(); + res.push(len_col); + res.push(Column::single(cols.timestamp)); + res.extend(outputs); + + res } +/// Creates the vector of `Columns` corresponding to the inputs of the Keccak sponge. +/// This is used to check that the inputs of the sponge correspond to the inputs +/// given by `KeccakStark`. pub(crate) fn ctl_looking_keccak_inputs() -> Vec> { let cols = KECCAK_SPONGE_COL_MAP; let mut res: Vec<_> = Column::singles( @@ -62,6 +81,9 @@ pub(crate) fn ctl_looking_keccak_inputs() -> Vec> { res } +/// Creates the vector of `Columns` corresponding to the outputs of the Keccak sponge. +/// This is used to check that the outputs of the sponge correspond to the outputs +/// given by `KeccakStark`. pub(crate) fn ctl_looking_keccak_outputs() -> Vec> { let cols = KECCAK_SPONGE_COL_MAP; @@ -83,6 +105,7 @@ pub(crate) fn ctl_looking_keccak_outputs() -> Vec> { res } +/// Creates the vector of `Columns` corresponding to the address and value of the byte being read from memory. 
pub(crate) fn ctl_looking_memory(i: usize) -> Vec> { let cols = KECCAK_SPONGE_COL_MAP; @@ -111,12 +134,16 @@ res } -pub(crate) fn num_logic_ctls() -> usize { +/// Returns the number of `KeccakSponge` tables looking into the `LogicStark`. +pub(crate) const fn num_logic_ctls() -> usize { const U8S_PER_CTL: usize = 32; ceil_div_usize(KECCAK_RATE_BYTES, U8S_PER_CTL) } -/// CTL for performing the `i`th logic CTL. Since we need to do 136 byte XORs, and the logic CTL can +/// Creates the vector of `Columns` required to perform the `i`th logic CTL. +/// It is comprised of the `IS_XOR` flag, the two inputs and the output +/// of the XOR operation. +/// Since we need to do 136 byte XORs, and the logic CTL can /// XOR 32 bytes per CTL, there are 5 such CTLs. pub(crate) fn ctl_looking_logic(i: usize) -> Vec> { const U32S_PER_CTL: usize = 8; @@ -156,34 +183,42 @@ res } -pub(crate) fn ctl_looked_filter() -> Column { +/// CTL filter for the final block rows of the `KeccakSponge` table. +pub(crate) fn ctl_looked_filter() -> Filter { // The CPU table is only interested in our final-block rows, since those contain the final // sponge output. - Column::sum(KECCAK_SPONGE_COL_MAP.is_final_input_len) + Filter::new_simple(Column::sum(KECCAK_SPONGE_COL_MAP.is_final_input_len)) } /// CTL filter for reading the `i`th byte of input from memory. 
-pub(crate) fn ctl_looking_memory_filter(i: usize) -> Column { +pub(crate) fn ctl_looking_memory_filter(i: usize) -> Filter { // We perform the `i`th read if either // - this is a full input block, or // - this is a final block of length `i` or greater let cols = KECCAK_SPONGE_COL_MAP; if i == KECCAK_RATE_BYTES - 1 { - Column::single(cols.is_full_input_block) + Filter::new_simple(Column::single(cols.is_full_input_block)) } else { - Column::sum(once(&cols.is_full_input_block).chain(&cols.is_final_input_len[i + 1..])) + Filter::new_simple(Column::sum( + once(&cols.is_full_input_block).chain(&cols.is_final_input_len[i + 1..]), + )) } } /// CTL filter for looking at XORs in the logic table. -pub(crate) fn ctl_looking_logic_filter() -> Column { +pub(crate) fn ctl_looking_logic_filter() -> Filter { let cols = KECCAK_SPONGE_COL_MAP; - Column::sum(once(&cols.is_full_input_block).chain(&cols.is_final_input_len)) + Filter::new_simple(Column::sum( + once(&cols.is_full_input_block).chain(&cols.is_final_input_len), + )) } -pub(crate) fn ctl_looking_keccak_filter() -> Column { +/// CTL filter for looking at the input and output in the Keccak table. +pub(crate) fn ctl_looking_keccak_filter() -> Filter { let cols = KECCAK_SPONGE_COL_MAP; - Column::sum(once(&cols.is_full_input_block).chain(&cols.is_final_input_len)) + Filter::new_simple(Column::sum( + once(&cols.is_full_input_block).chain(&cols.is_final_input_len), + )) } /// Information about a Keccak sponge operation needed for witness generation. @@ -199,12 +234,14 @@ pub(crate) struct KeccakSpongeOp { pub(crate) input: Vec, } +/// Structure representing the `KeccakSponge` STARK, which carries out the sponge permutation. #[derive(Copy, Clone, Default)] -pub struct KeccakSpongeStark { +pub(crate) struct KeccakSpongeStark { f: PhantomData, } impl, const D: usize> KeccakSpongeStark { + /// Generates the trace polynomial values for the `KeccakSponge`STARK. 
pub(crate) fn generate_trace( &self, operations: Vec, @@ -218,15 +255,16 @@ impl, const D: usize> KeccakSpongeStark { self.generate_trace_rows(operations, min_rows) ); - let trace_polys = timed!( - timing, - "convert to PolynomialValues", - trace_rows_to_poly_values(trace_rows) - ); + let trace_row_vecs: Vec<_> = trace_rows.into_iter().map(|row| row.to_vec()).collect(); - trace_polys + let mut trace_cols = transpose(&trace_row_vecs); + self.generate_range_checks(&mut trace_cols); + + trace_cols.into_iter().map(PolynomialValues::new).collect() } + /// Generates the trace rows given the vector of `KeccakSponge` operations. + /// The trace is padded to a power of two with all-zero rows. fn generate_trace_rows( &self, operations: Vec, @@ -237,9 +275,11 @@ impl, const D: usize> KeccakSpongeStark { .map(|op| op.input.len() / KECCAK_RATE_BYTES + 1) .sum(); let mut rows = Vec::with_capacity(base_len.max(min_rows).next_power_of_two()); + // Generate active rows. for op in operations { rows.extend(self.generate_rows_for_op(op)); } + // Pad the trace. let padded_rows = rows.len().max(min_rows).next_power_of_two(); for _ in rows.len()..padded_rows { rows.push(self.generate_padding_row()); @@ -247,6 +287,9 @@ impl, const D: usize> KeccakSpongeStark { rows } + /// Generates the rows associated to a given operation: + /// Performs a Keccak sponge permutation and fills the STARK's rows accordingly. + /// The number of rows is the number of input chunks of size `KECCAK_RATE_BYTES`. fn generate_rows_for_op(&self, op: KeccakSpongeOp) -> Vec<[F; NUM_KECCAK_SPONGE_COLUMNS]> { let mut rows = Vec::with_capacity(op.input.len() / KECCAK_RATE_BYTES + 1); @@ -255,6 +298,7 @@ impl, const D: usize> KeccakSpongeStark { let mut input_blocks = op.input.chunks_exact(KECCAK_RATE_BYTES); let mut already_absorbed_bytes = 0; for block in input_blocks.by_ref() { + // We compute the updated state of the sponge. 
let row = self.generate_full_input_row( &op, already_absorbed_bytes, @@ -262,6 +306,9 @@ impl, const D: usize> KeccakSpongeStark { block.try_into().unwrap(), ); + // We update the state limbs for the next block absorption. + // The first `KECCAK_DIGEST_U32s` limbs are stored as bytes after the computation, + // so we recompute the corresponding `u32` and update the first state limbs. sponge_state[..KECCAK_DIGEST_U32S] .iter_mut() .zip(row.updated_digest_state_bytes.chunks_exact(4)) @@ -273,6 +320,8 @@ impl, const D: usize> KeccakSpongeStark { .sum(); }); + // The rest of the bytes are already stored in the expected form, so we can directly + // update the state with the stored values. sponge_state[KECCAK_DIGEST_U32S..] .iter_mut() .zip(row.partial_updated_state_u32s) @@ -295,6 +344,8 @@ impl, const D: usize> KeccakSpongeStark { rows } + /// Generates a row where all bytes are input bytes, not padding bytes. + /// This includes updating the state sponge with a single absorption. fn generate_full_input_row( &self, op: &KeccakSpongeOp, @@ -313,6 +364,10 @@ impl, const D: usize> KeccakSpongeStark { row } + /// Generates a row containing the last input bytes. + /// On top of computing one absorption and padding the input, + /// we indicate the last non-padding input byte by setting + /// `row.is_final_input_len[final_inputs.len()]` to 1. fn generate_final_row( &self, op: &KeccakSpongeOp, @@ -345,6 +400,9 @@ impl, const D: usize> KeccakSpongeStark { /// Generate fields that are common to both full-input-block rows and final-block rows. /// Also updates the sponge state with a single absorption. + /// Given a state S = R || C and a block input B, + /// - R is updated with R XOR B, + /// - S is replaced by keccakf_u32s(S). 
fn generate_common_fields( row: &mut KeccakSpongeColumnsView, op: &KeccakSpongeOp, @@ -355,7 +413,6 @@ impl, const D: usize> KeccakSpongeStark { row.segment = F::from_canonical_usize(op.base_address.segment); row.virt = F::from_canonical_usize(op.base_address.virt); row.timestamp = F::from_canonical_usize(op.timestamp); - row.len = F::from_canonical_usize(op.input.len()); row.already_absorbed_bytes = F::from_canonical_usize(already_absorbed_bytes); row.original_rate_u32s = sponge_state[..KECCAK_RATE_U32S] @@ -428,6 +485,38 @@ impl, const D: usize> KeccakSpongeStark { // indicating that it's a dummy/padding row. KeccakSpongeColumnsView::default().into() } + + /// Expects input in *column*-major layout + fn generate_range_checks(&self, cols: &mut [Vec]) { + debug_assert!(cols.len() == NUM_KECCAK_SPONGE_COLUMNS); + + let n_rows = cols[0].len(); + debug_assert!(cols.iter().all(|col| col.len() == n_rows)); + + for i in 0..BYTE_RANGE_MAX { + cols[RANGE_COUNTER][i] = F::from_canonical_usize(i); + } + for i in BYTE_RANGE_MAX..n_rows { + cols[RANGE_COUNTER][i] = F::from_canonical_usize(BYTE_RANGE_MAX - 1); + } + + // For each column c in cols, generate the range-check + // permutations and put them in the corresponding range-check + // columns rc_c and rc_c+1. + for col in 0..KECCAK_RATE_BYTES { + let c = get_single_block_bytes_value(col); + for i in 0..n_rows { + let x = cols[c][i].to_canonical_u64() as usize; + assert!( + x < BYTE_RANGE_MAX, + "column value {} exceeds the max range value {}", + x, + BYTE_RANGE_MAX + ); + cols[RC_FREQUENCIES][x] += F::ONE; + } + } + } } impl, const D: usize> Stark for KeccakSpongeStark { @@ -453,6 +542,17 @@ impl, const D: usize> Stark for KeccakSpongeS vars.get_next_values().try_into().unwrap(); let next_values: &KeccakSpongeColumnsView

= next_values.borrow(); + // Check the range column: First value must be 0, last row + // must be 255, and intermediate rows must increment by 0 + // or 1. + let rc1 = local_values.range_counter; + let rc2 = next_values.range_counter; + yield_constr.constraint_first_row(rc1); + let incr = rc2 - rc1; + yield_constr.constraint_transition(incr * incr - incr); + let range_max = P::Scalar::from_canonical_u64((BYTE_RANGE_MAX - 1) as u64); + yield_constr.constraint_last_row(rc1 - range_max); + // Each flag (full-input block, final block or implied dummy flag) must be boolean. let is_full_input_block = local_values.is_full_input_block; yield_constr.constraint(is_full_input_block * (is_full_input_block - P::ONES)); @@ -542,13 +642,6 @@ impl, const D: usize> Stark for KeccakSpongeS yield_constr.constraint_transition( is_dummy * (next_values.is_full_input_block + next_is_final_block), ); - - // If this is a final block, is_final_input_len implies `len - already_absorbed == i`. - let offset = local_values.len - already_absorbed_bytes; - for (i, &is_final_len) in local_values.is_final_input_len.iter().enumerate() { - let entry_match = offset - P::from(FE::from_canonical_usize(i)); - yield_constr.constraint(is_final_len * entry_match); - } } fn eval_ext_circuit( @@ -566,6 +659,20 @@ impl, const D: usize> Stark for KeccakSpongeS let one = builder.one_extension(); + // Check the range column: First value must be 0, last row + // must be 255, and intermediate rows must increment by 0 + // or 1. 
+ let rc1 = local_values.range_counter; + let rc2 = next_values.range_counter; + yield_constr.constraint_first_row(builder, rc1); + let incr = builder.sub_extension(rc2, rc1); + let t = builder.mul_sub_extension(incr, incr, incr); + yield_constr.constraint_transition(builder, t); + let range_max = + builder.constant_extension(F::Extension::from_canonical_usize(BYTE_RANGE_MAX - 1)); + let t = builder.sub_extension(rc1, range_max); + yield_constr.constraint_last_row(builder, t); + // Each flag (full-input block, final block or implied dummy flag) must be boolean. let is_full_input_block = local_values.is_full_input_block; let constraint = builder.mul_sub_extension( @@ -686,39 +793,33 @@ impl, const D: usize> Stark for KeccakSpongeS builder.mul_extension(is_dummy, tmp) }; yield_constr.constraint_transition(builder, constraint); - - // If this is a final block, is_final_input_len implies `len - already_absorbed == i`. - let offset = builder.sub_extension(local_values.len, already_absorbed_bytes); - for (i, &is_final_len) in local_values.is_final_input_len.iter().enumerate() { - let index = builder.constant_extension(F::from_canonical_usize(i).into()); - let entry_match = builder.sub_extension(offset, index); - - let constraint = builder.mul_extension(is_final_len, entry_match); - yield_constr.constraint(builder, constraint); - } } fn constraint_degree(&self) -> usize { 3 } + + fn lookups(&self) -> Vec> { + vec![Lookup { + columns: Column::singles(get_block_bytes_range()).collect(), + table_column: Column::single(RANGE_COUNTER), + frequencies_column: Column::single(RC_FREQUENCIES), + filter_columns: vec![None; KECCAK_RATE_BYTES], + }] + } } #[cfg(test)] mod tests { - use std::borrow::Borrow; - use anyhow::Result; - use itertools::Itertools; use keccak_hash::keccak; use plonky2::field::goldilocks_field::GoldilocksField; use plonky2::field::types::PrimeField64; use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - use 
crate::keccak_sponge::columns::KeccakSpongeColumnsView; - use crate::keccak_sponge::keccak_sponge_stark::{KeccakSpongeOp, KeccakSpongeStark}; + use super::*; use crate::memory::segments::Segment; use crate::stark_testing::{test_stark_circuit_constraints, test_stark_low_degree}; - use crate::witness::memory::MemoryAddress; #[test] fn test_stark_degree() -> Result<()> { @@ -752,11 +853,7 @@ mod tests { let expected_output = keccak(&input); let op = KeccakSpongeOp { - base_address: MemoryAddress { - context: 0, - segment: Segment::Code as usize, - virt: 0, - }, + base_address: MemoryAddress::new(0, Segment::Code, 0), timestamp: 0, input, }; diff --git a/evm/src/lib.rs b/evm/src/lib.rs index b678ec58e4..025fc8e63d 100644 --- a/evm/src/lib.rs +++ b/evm/src/lib.rs @@ -1,8 +1,168 @@ -#![allow(incomplete_features)] +//! An implementation of a Type 1 zk-EVM by Polygon Zero. +//! +//! Following the [zk-EVM classification of V. Buterin](https://vitalik.eth.limo/general/2022/08/04/zkevm.html), +//! the plonky2_evm crate aims at providing an efficient solution for the problem of generating cryptographic +//! proofs of Ethereum-like transactions with *full Ethereum capability*. +//! +//! To this end, the plonky2 zk-EVM is tailored for an AIR-based STARK system satisfying degree 3 constraints, +//! with support for recursive aggregation leveraging plonky2 circuits with FRI-based plonkish arithmetization. +//! These circuits require a one-time, offline preprocessing phase. +//! See the [`fixed_recursive_verifier`] module for more details on how this works. +//! These preprocessed circuits are gathered within the [`AllRecursiveCircuits`] prover state, +//! and can be generated as such: +//! +//! ```ignore +//! // Specify the base field to use. +//! type F = GoldilocksField; +//! // Specify the extension degree to use. +//! const D: usize = 2; +//! // Specify the recursive configuration to use, here leveraging Poseidon hash +//! 
// over the Goldilocks field both natively and in-circuit. +//! type C = PoseidonGoldilocksConfig; +//! +//! let all_stark = AllStark::::default(); +//! let config = StarkConfig::standard_fast_config(); +//! +//! // Generate all the recursive circuits needed to generate succinct proofs for blocks. +//! // The ranges correspond to the supported table sizes for each individual STARK component. +//! let prover_state = AllRecursiveCircuits::::new( +//! &all_stark, +//! &[16..25, 10..20, 12..25, 14..25, 9..20, 12..20, 17..30], +//! &config, +//! ); +//! ``` +//! +//! # Inputs type +//! +//! Transactions need to be processed into an Intermediary Representation (IR) format for the prover +//! to be able to generate proofs of valid state transition. This involves passing the encoded transaction, +//! the header of the block in which it was included, some information on the state prior execution +//! of this transaction, etc. +//! This intermediary representation is called [`GenerationInputs`]. +//! +//! +//! # Generating succinct proofs +//! +//! ## Transaction proofs +//! +//! To generate a proof for a transaction, given its [`GenerationInputs`] and an [`AllRecursiveCircuits`] +//! prover state, one can simply call the [prove_root](AllRecursiveCircuits::prove_root) method. +//! +//! ```ignore +//! let mut timing = TimingTree::new("prove", log::Level::Debug); +//! let kill_signal = None; // Useful only with distributed proving to kill hanging jobs. +//! let (proof, public_values) = +//! prover_state.prove_root(all_stark, config, inputs, &mut timing, kill_signal); +//! ``` +//! +//! This outputs a transaction proof and its associated public values. These are necessary during the +//! aggregation levels (see below). If one were to miss the public values, they are also retrievable directly +//! from the proof's encoded public inputs, as such: +//! +//! ```ignore +//! let public_values = PublicValues::from_public_inputs(&proof.public_inputs); +//! ``` +//! +//! 
## Aggregation proofs +//! +//! Because the plonky2 zkEVM generates proofs on a transaction basis, we then need to aggregate them for succinct +//! verification. This is done in a binary tree fashion, where each inner node proof verifies two children proofs, +//! through the [prove_aggregation](AllRecursiveCircuits::prove_aggregation) method. +//! Note that the tree does *not* need to be complete, as this aggregation process can take as inputs both regular +//! transaction proofs and aggregation proofs. We only need to specify for each child if it is an aggregation proof +//! or a regular one. +//! +//! ```ignore +//! let (proof_1, pv_1) = +//! prover_state.prove_root(all_stark, config, inputs_1, &mut timing, None); +//! let (proof_2, pv_2) = +//! prover_state.prove_root(all_stark, config, inputs_2, &mut timing, None); +//! let (proof_3, pv_3) = +//! prover_state.prove_root(all_stark, config, inputs_3, &mut timing, None); +//! +//! // Now aggregate proofs for txn 1 and 2. +//! let (agg_proof_1_2, pv_1_2) = +//! prover_state.prove_aggregation(false, proof_1, pv_1, false, proof_2, pv_2); +//! +//! // Now aggregate the newly generated aggregation proof with the last regular txn proof. +//! let (agg_proof_1_3, pv_1_3) = +//! prover_state.prove_aggregation(true, agg_proof_1_2, pv_1_2, false, proof_3, pv_3); +//! ``` +//! +//! **Note**: The proofs provided to the [prove_aggregation](AllRecursiveCircuits::prove_aggregation) method *MUST* have contiguous states. +//! Trying to combine `proof_1` and `proof_3` from the example above would fail. +//! +//! ## Block proofs +//! +//! Once all transactions of a block have been proven and we are left with a single aggregation proof and its public values, +//! we can then wrap it into a final block proof, attesting validity of the entire block. +//! This [prove_block](AllRecursiveCircuits::prove_block) method accepts an optional previous block proof as argument, +//! 
which will then try combining the previously proven block with the current one, generating a validity proof for both. +//! Applying this process from genesis would yield a single proof attesting correctness of the entire chain. +//! +//! ```ignore +//! let previous_block_proof = { ... }; +//! let (block_proof, block_public_values) = +//! prover_state.prove_block(Some(&previous_block_proof), &agg_proof, agg_pv)?; +//! ``` +//! +//! ### Checkpoint heights +//! +//! The process of always providing a previous block proof when generating a proof for the current block may yield some +//! undesirable issues. For this reason, the plonky2 zk-EVM supports checkpoint heights. At given block heights, +//! the prover does not have to pass a previous block proof. This would in practice correspond to block heights at which +//! a proof has been generated and sent to L1 for settlement. +//! +//! The only requirement when generating a block proof without passing a previous one as argument is to have the +//! `checkpoint_state_trie_root` metadata in the `PublicValues` of the final aggregation proof be matching the state +//! trie before applying all the included transactions. If this condition is not met, the prover will fail to generate +//! a valid proof. +//! +//! +//! ```ignore +//! let (block_proof, block_public_values) = +//! prover_state.prove_block(None, &agg_proof, agg_pv)?; +//! ``` +//! +//! # Prover state serialization +//! +//! Because the recursive circuits only need to be generated once, they can be saved to disk once the preprocessing phase +//! completed successfully, and deserialized on-demand. +//! The plonky2 zk-EVM provides serialization methods to convert the entire prover state to a vector of bytes, and vice-versa. +//! This requires the use of custom serializers for gates and generators for proper recursive circuit encoding. This crate provides +//! default serializers supporting all custom gates and associated generators defined within the [`plonky2`] crate. 
+//! +//! ```ignore +//! let prover_state = AllRecursiveCircuits::::new(...); +//! +//! // Default serializers +//! let gate_serializer = DefaultGateSerializer; +//! let generator_serializer = DefaultGeneratorSerializer:: { +//! _phantom: PhantomData::, +//! }; +//! +//! // Serialize the prover state to a sequence of bytes +//! let bytes = prover_state.to_bytes(false, &gate_serializer, &generator_serializer).unwrap(); +//! +//! // Deserialize the bytes into a prover state +//! let recovered_prover_state = AllRecursiveCircuits::::from_bytes( +//! &all_circuits_bytes, +//! false, +//! &gate_serializer, +//! &generator_serializer, +//! ).unwrap(); +//! +//! assert_eq!(prover_state, recovered_prover_state); +//! ``` +//! +//! Note that an entire prover state built with wide ranges may be particularly large (up to ~25 GB), hence serialization methods, +//! while faster than doing another preprocessing, may take some non-negligible time. + +#![cfg_attr(docsrs, feature(doc_cfg))] #![allow(clippy::needless_range_loop)] #![allow(clippy::too_many_arguments)] -#![allow(clippy::type_complexity)] #![allow(clippy::field_reassign_with_default)] +#![allow(unused)] #![feature(let_chains)] pub mod all_stark; @@ -27,12 +187,14 @@ pub mod proof; pub mod prover; pub mod recursive_verifier; pub mod stark; -pub mod stark_testing; pub mod util; pub mod vanishing_poly; pub mod verifier; pub mod witness; +#[cfg(test)] +mod stark_testing; + use eth_trie_utils::partial_trie::HashedPartialTrie; // Set up Jemalloc #[cfg(not(target_env = "msvc"))] @@ -42,4 +204,11 @@ use jemallocator::Jemalloc; #[global_allocator] static GLOBAL: Jemalloc = Jemalloc; +// Public definitions and re-exports + pub type Node = eth_trie_utils::partial_trie::Node; + +pub use all_stark::AllStark; +pub use config::StarkConfig; +pub use fixed_recursive_verifier::AllRecursiveCircuits; +pub use generation::GenerationInputs; diff --git a/evm/src/logic.rs b/evm/src/logic.rs index 319dfab2d0..7300c6af65 100644 --- 
a/evm/src/logic.rs +++ b/evm/src/logic.rs @@ -1,4 +1,4 @@ -use std::marker::PhantomData; +use core::marker::PhantomData; use ethereum_types::U256; use itertools::izip; @@ -13,35 +13,43 @@ use plonky2::util::timing::TimingTree; use plonky2_util::ceil_div_usize; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::cross_table_lookup::Column; use crate::evaluation_frame::{StarkEvaluationFrame, StarkFrame}; use crate::logic::columns::NUM_COLUMNS; +use crate::lookup::{Column, Filter}; use crate::stark::Stark; use crate::util::{limb_from_bits_le, limb_from_bits_le_recursive, trace_rows_to_poly_values}; -// Total number of bits per input/output. +/// Total number of bits per input/output. const VAL_BITS: usize = 256; -// Number of bits stored per field element. Ensure that this fits; it is not checked. +/// Number of bits stored per field element. Ensure that this fits; it is not checked. pub(crate) const PACKED_LIMB_BITS: usize = 32; -// Number of field elements needed to store each input/output at the specified packing. +/// Number of field elements needed to store each input/output at the specified packing. const PACKED_LEN: usize = ceil_div_usize(VAL_BITS, PACKED_LIMB_BITS); +/// `LogicStark` columns. pub(crate) mod columns { - use std::cmp::min; - use std::ops::Range; + use core::cmp::min; + use core::ops::Range; use super::{PACKED_LEN, PACKED_LIMB_BITS, VAL_BITS}; - pub const IS_AND: usize = 0; - pub const IS_OR: usize = IS_AND + 1; - pub const IS_XOR: usize = IS_OR + 1; - // The inputs are decomposed into bits. - pub const INPUT0: Range = (IS_XOR + 1)..(IS_XOR + 1) + VAL_BITS; - pub const INPUT1: Range = INPUT0.end..INPUT0.end + VAL_BITS; - // The result is packed in limbs of `PACKED_LIMB_BITS` bits. - pub const RESULT: Range = INPUT1.end..INPUT1.end + PACKED_LEN; - - pub fn limb_bit_cols_for_input(input_bits: Range) -> impl Iterator> { + /// 1 if this is an AND operation, 0 otherwise. 
+ pub(crate) const IS_AND: usize = 0; + /// 1 if this is an OR operation, 0 otherwise. + pub(crate) const IS_OR: usize = IS_AND + 1; + /// 1 if this is a XOR operation, 0 otherwise. + pub(crate) const IS_XOR: usize = IS_OR + 1; + /// First input, decomposed into bits. + pub(crate) const INPUT0: Range = (IS_XOR + 1)..(IS_XOR + 1) + VAL_BITS; + /// Second input, decomposed into bits. + pub(crate) const INPUT1: Range = INPUT0.end..INPUT0.end + VAL_BITS; + /// The result is packed in limbs of `PACKED_LIMB_BITS` bits. + pub(crate) const RESULT: Range = INPUT1.end..INPUT1.end + PACKED_LEN; + + /// Returns the column range for each 32 bit chunk in the input. + pub(crate) fn limb_bit_cols_for_input( + input_bits: Range, + ) -> impl Iterator> { (0..PACKED_LEN).map(move |i| { let start = input_bits.start + i * PACKED_LIMB_BITS; let end = min(start + PACKED_LIMB_BITS, input_bits.end); @@ -49,10 +57,12 @@ pub(crate) mod columns { }) } - pub const NUM_COLUMNS: usize = RESULT.end; + /// Number of columns in `LogicStark`. + pub(crate) const NUM_COLUMNS: usize = RESULT.end; } -pub fn ctl_data() -> Vec> { +/// Creates the vector of `Columns` corresponding to the opcode, the two inputs and the output of the logic operation. +pub(crate) fn ctl_data() -> Vec> { // We scale each filter flag with the associated opcode value. // If a logic operation is happening on the CPU side, the CTL // will enforce that the reconstructed opcode value from the @@ -68,15 +78,22 @@ pub fn ctl_data() -> Vec> { res } -pub fn ctl_filter() -> Column { - Column::sum([columns::IS_AND, columns::IS_OR, columns::IS_XOR]) +/// CTL filter for logic operations. +pub(crate) fn ctl_filter() -> Filter { + Filter::new_simple(Column::sum([ + columns::IS_AND, + columns::IS_OR, + columns::IS_XOR, + ])) } +/// Structure representing the Logic STARK, which computes all logic operations. #[derive(Copy, Clone, Default)] -pub struct LogicStark { +pub(crate) struct LogicStark { pub f: PhantomData, } +/// Logic operations. 
#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub(crate) enum Op { And, @@ -85,6 +102,7 @@ pub(crate) enum Op { } impl Op { + /// Returns the output of the current Logic operation. pub(crate) fn result(&self, a: U256, b: U256) -> U256 { match self { Op::And => a & b, @@ -94,6 +112,8 @@ impl Op { } } +/// A logic operation over `U256`` words. It contains an operator, +/// either `AND`, `OR` or `XOR`, two inputs and its expected result. #[derive(Debug)] pub(crate) struct Operation { operator: Op, @@ -103,6 +123,8 @@ pub(crate) struct Operation { } impl Operation { + /// Computes the expected result of an operator with the two provided inputs, + /// and returns the associated logic `Operation`. pub(crate) fn new(operator: Op, input0: U256, input1: U256) -> Self { let result = operator.result(input0, input1); Operation { @@ -113,6 +135,7 @@ impl Operation { } } + /// Given an `Operation`, fills a row with the corresponding flag, inputs and output. fn into_row(self) -> [F; NUM_COLUMNS] { let Operation { operator, @@ -140,17 +163,20 @@ impl Operation { } impl LogicStark { + /// Generates the trace polynomials for `LogicStark`. pub(crate) fn generate_trace( &self, operations: Vec, min_rows: usize, timing: &mut TimingTree, ) -> Vec> { + // First, turn all provided operations into rows in `LogicStark`, and pad if necessary. let trace_rows = timed!( timing, "generate trace rows", self.generate_trace_rows(operations, min_rows) ); + // Generate the trace polynomials from the trace values. let trace_polys = timed!( timing, "convert to PolynomialValues", @@ -159,6 +185,8 @@ impl LogicStark { trace_polys } + /// Generate the `LogicStark` traces based on the provided vector of operations. + /// The trace is padded to a power of two with all-zero rows. 
fn generate_trace_rows( &self, operations: Vec, @@ -199,11 +227,19 @@ impl, const D: usize> Stark for LogicStark sum_coeff = 0, and_coeff = 1` // `OR => sum_coeff = 1, and_coeff = -1` @@ -248,11 +284,21 @@ impl, const D: usize> Stark for LogicStark sum_coeff = 0, and_coeff = 1` // `OR => sum_coeff = 1, and_coeff = -1` diff --git a/evm/src/lookup.rs b/evm/src/lookup.rs index a85544adc8..f98814f9a1 100644 --- a/evm/src/lookup.rs +++ b/evm/src/lookup.rs @@ -1,3 +1,7 @@ +use core::borrow::Borrow; +use core::fmt::Debug; +use core::iter::repeat; + use itertools::Itertools; use num_bigint::BigUint; use plonky2::field::batch_util::batch_add_inplace; @@ -9,25 +13,390 @@ use plonky2::hash::hash_types::RichField; use plonky2::iop::ext_target::ExtensionTarget; use plonky2::iop::target::Target; use plonky2::plonk::circuit_builder::CircuitBuilder; +use plonky2::plonk::plonk_common::{ + reduce_with_powers, reduce_with_powers_circuit, reduce_with_powers_ext_circuit, +}; use plonky2_util::ceil_div_usize; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::evaluation_frame::StarkEvaluationFrame; use crate::stark::Stark; -pub struct Lookup { +/// Represents a filter, which evaluates to 1 if the row must be considered and 0 if it should be ignored. +/// It's an arbitrary degree 2 combination of columns: `products` are the degree 2 terms, and `constants` are +/// the degree 1 terms. +#[derive(Clone, Debug)] +pub(crate) struct Filter { + products: Vec<(Column, Column)>, + constants: Vec>, +} + +impl Filter { + pub(crate) fn new(products: Vec<(Column, Column)>, constants: Vec>) -> Self { + Self { + products, + constants, + } + } + + /// Returns a filter made of a single column. + pub(crate) fn new_simple(col: Column) -> Self { + Self { + products: vec![], + constants: vec![col], + } + } + + /// Given the column values for the current and next rows, evaluates the filter. 
+ pub(crate) fn eval_filter(&self, v: &[P], next_v: &[P]) -> P + where + FE: FieldExtension, + P: PackedField, + { + self.products + .iter() + .map(|(col1, col2)| col1.eval_with_next(v, next_v) * col2.eval_with_next(v, next_v)) + .sum::

() + + self + .constants + .iter() + .map(|col| col.eval_with_next(v, next_v)) + .sum::

() + } + + /// Circuit version of `eval_filter`: + /// Given the column values for the current and next rows, evaluates the filter. + pub(crate) fn eval_filter_circuit( + &self, + builder: &mut CircuitBuilder, + v: &[ExtensionTarget], + next_v: &[ExtensionTarget], + ) -> ExtensionTarget + where + F: RichField + Extendable, + { + let prods = self + .products + .iter() + .map(|(col1, col2)| { + let col1_eval = col1.eval_with_next_circuit(builder, v, next_v); + let col2_eval = col2.eval_with_next_circuit(builder, v, next_v); + builder.mul_extension(col1_eval, col2_eval) + }) + .collect::>(); + + let consts = self + .constants + .iter() + .map(|col| col.eval_with_next_circuit(builder, v, next_v)) + .collect::>(); + + let prods = builder.add_many_extension(prods); + let consts = builder.add_many_extension(consts); + builder.add_extension(prods, consts) + } + + /// Evaluate on a row of a table given in column-major form. + pub(crate) fn eval_table(&self, table: &[PolynomialValues], row: usize) -> F { + self.products + .iter() + .map(|(col1, col2)| col1.eval_table(table, row) * col2.eval_table(table, row)) + .sum::() + + self + .constants + .iter() + .map(|col| col.eval_table(table, row)) + .sum() + } +} + +/// Represent two linear combination of columns, corresponding to the current and next row values. +/// Each linear combination is represented as: +/// - a vector of `(usize, F)` corresponding to the column number and the associated multiplicand +/// - the constant of the linear combination. +#[derive(Clone, Debug)] +pub(crate) struct Column { + linear_combination: Vec<(usize, F)>, + next_row_linear_combination: Vec<(usize, F)>, + constant: F, +} + +impl Column { + /// Returns the representation of a single column in the current row. + pub(crate) fn single(c: usize) -> Self { + Self { + linear_combination: vec![(c, F::ONE)], + next_row_linear_combination: vec![], + constant: F::ZERO, + } + } + + /// Returns multiple single columns in the current row. 
+ pub(crate) fn singles>>( + cs: I, + ) -> impl Iterator { + cs.into_iter().map(|c| Self::single(*c.borrow())) + } + + /// Returns the representation of a single column in the next row. + pub(crate) fn single_next_row(c: usize) -> Self { + Self { + linear_combination: vec![], + next_row_linear_combination: vec![(c, F::ONE)], + constant: F::ZERO, + } + } + + /// Returns multiple single columns for the next row. + pub(crate) fn singles_next_row>>( + cs: I, + ) -> impl Iterator { + cs.into_iter().map(|c| Self::single_next_row(*c.borrow())) + } + + /// Returns a linear combination corresponding to a constant. + pub(crate) fn constant(constant: F) -> Self { + Self { + linear_combination: vec![], + next_row_linear_combination: vec![], + constant, + } + } + + /// Returns a linear combination corresponding to 0. + pub(crate) fn zero() -> Self { + Self::constant(F::ZERO) + } + + /// Returns a linear combination corresponding to 1. + pub(crate) fn one() -> Self { + Self::constant(F::ONE) + } + + /// Given an iterator of `(usize, F)` and a constant, returns the association linear combination of columns for the current row. + pub(crate) fn linear_combination_with_constant>( + iter: I, + constant: F, + ) -> Self { + let v = iter.into_iter().collect::>(); + assert!(!v.is_empty()); + debug_assert_eq!( + v.iter().map(|(c, _)| c).unique().count(), + v.len(), + "Duplicate columns." + ); + Self { + linear_combination: v, + next_row_linear_combination: vec![], + constant, + } + } + + /// Given an iterator of `(usize, F)` and a constant, returns the associated linear combination of columns for the current and the next rows. 
+ pub(crate) fn linear_combination_and_next_row_with_constant< + I: IntoIterator, + >( + iter: I, + next_row_iter: I, + constant: F, + ) -> Self { + let v = iter.into_iter().collect::>(); + let next_row_v = next_row_iter.into_iter().collect::>(); + + assert!(!v.is_empty() || !next_row_v.is_empty()); + debug_assert_eq!( + v.iter().map(|(c, _)| c).unique().count(), + v.len(), + "Duplicate columns." + ); + debug_assert_eq!( + next_row_v.iter().map(|(c, _)| c).unique().count(), + next_row_v.len(), + "Duplicate columns." + ); + + Self { + linear_combination: v, + next_row_linear_combination: next_row_v, + constant, + } + } + + /// Returns a linear combination of columns, with no additional constant. + pub(crate) fn linear_combination>(iter: I) -> Self { + Self::linear_combination_with_constant(iter, F::ZERO) + } + + /// Given an iterator of columns (c_0, ..., c_n) containing bits in little endian order: + /// returns the representation of c_0 + 2 * c_1 + ... + 2^n * c_n. + pub(crate) fn le_bits>>(cs: I) -> Self { + Self::linear_combination(cs.into_iter().map(|c| *c.borrow()).zip(F::TWO.powers())) + } + + /// Given an iterator of columns (c_0, ..., c_n) containing bits in little endian order: + /// returns the representation of c_0 + 2 * c_1 + ... + 2^n * c_n + k where `k` is an + /// additional constant. + pub(crate) fn le_bits_with_constant>>( + cs: I, + constant: F, + ) -> Self { + Self::linear_combination_with_constant( + cs.into_iter().map(|c| *c.borrow()).zip(F::TWO.powers()), + constant, + ) + } + + /// Given an iterator of columns (c_0, ..., c_n) containing bytes in little endian order: + /// returns the representation of c_0 + 256 * c_1 + ... + 256^n * c_n. + pub(crate) fn le_bytes>>(cs: I) -> Self { + Self::linear_combination( + cs.into_iter() + .map(|c| *c.borrow()) + .zip(F::from_canonical_u16(256).powers()), + ) + } + + /// Given an iterator of columns, returns the representation of their sum. 
+ pub(crate) fn sum>>(cs: I) -> Self { + Self::linear_combination(cs.into_iter().map(|c| *c.borrow()).zip(repeat(F::ONE))) + } + + /// Given the column values for the current row, returns the evaluation of the linear combination. + pub(crate) fn eval(&self, v: &[P]) -> P + where + FE: FieldExtension, + P: PackedField, + { + self.linear_combination + .iter() + .map(|&(c, f)| v[c] * FE::from_basefield(f)) + .sum::

() + + FE::from_basefield(self.constant) + } + + /// Given the column values for the current and next rows, evaluates the current and next linear combinations and returns their sum. + pub(crate) fn eval_with_next(&self, v: &[P], next_v: &[P]) -> P + where + FE: FieldExtension, + P: PackedField, + { + self.linear_combination + .iter() + .map(|&(c, f)| v[c] * FE::from_basefield(f)) + .sum::

() + + self + .next_row_linear_combination + .iter() + .map(|&(c, f)| next_v[c] * FE::from_basefield(f)) + .sum::

() + + FE::from_basefield(self.constant) + } + + /// Evaluate on a row of a table given in column-major form. + pub(crate) fn eval_table(&self, table: &[PolynomialValues], row: usize) -> F { + let mut res = self + .linear_combination + .iter() + .map(|&(c, f)| table[c].values[row] * f) + .sum::() + + self.constant; + + // If we access the next row at the last row, for sanity, we consider the next row's values to be 0. + // If the lookups are correctly written, the filter should be 0 in that case anyway. + if !self.next_row_linear_combination.is_empty() && row < table[0].values.len() - 1 { + res += self + .next_row_linear_combination + .iter() + .map(|&(c, f)| table[c].values[row + 1] * f) + .sum::(); + } + + res + } + + /// Evaluates the column on all rows. + pub(crate) fn eval_all_rows(&self, table: &[PolynomialValues]) -> Vec { + let length = table[0].len(); + (0..length) + .map(|row| self.eval_table(table, row)) + .collect::>() + } + + /// Circuit version of `eval`: Given a row's targets, returns their linear combination. + pub(crate) fn eval_circuit( + &self, + builder: &mut CircuitBuilder, + v: &[ExtensionTarget], + ) -> ExtensionTarget + where + F: RichField + Extendable, + { + let pairs = self + .linear_combination + .iter() + .map(|&(c, f)| { + ( + v[c], + builder.constant_extension(F::Extension::from_basefield(f)), + ) + }) + .collect::>(); + let constant = builder.constant_extension(F::Extension::from_basefield(self.constant)); + builder.inner_product_extension(F::ONE, constant, pairs) + } + + /// Circuit version of `eval_with_next`: + /// Given the targets of the current and next row, returns the sum of their linear combinations. 
+ pub(crate) fn eval_with_next_circuit( + &self, + builder: &mut CircuitBuilder, + v: &[ExtensionTarget], + next_v: &[ExtensionTarget], + ) -> ExtensionTarget + where + F: RichField + Extendable, + { + let mut pairs = self + .linear_combination + .iter() + .map(|&(c, f)| { + ( + v[c], + builder.constant_extension(F::Extension::from_basefield(f)), + ) + }) + .collect::>(); + let next_row_pairs = self.next_row_linear_combination.iter().map(|&(c, f)| { + ( + next_v[c], + builder.constant_extension(F::Extension::from_basefield(f)), + ) + }); + pairs.extend(next_row_pairs); + let constant = builder.constant_extension(F::Extension::from_basefield(self.constant)); + builder.inner_product_extension(F::ONE, constant, pairs) + } +} + +pub(crate) type ColumnFilter<'a, F> = (&'a [Column], &'a Option>); + +pub struct Lookup { /// Columns whose values should be contained in the lookup table. /// These are the f_i(x) polynomials in the logUp paper. - pub(crate) columns: Vec, + pub(crate) columns: Vec>, /// Column containing the lookup table. /// This is the t(x) polynomial in the paper. - pub(crate) table_column: usize, + pub(crate) table_column: Column, /// Column containing the frequencies of `columns` in `table_column`. /// This is the m(x) polynomial in the paper. - pub(crate) frequencies_column: usize, + pub(crate) frequencies_column: Column, + + /// Columns to filter some elements. There is at most one filter + /// column per column to range-check. + pub(crate) filter_columns: Vec>>, } -impl Lookup { +impl Lookup { pub(crate) fn num_helper_columns(&self, constraint_degree: usize) -> usize { // One helper column for each column batch of size `constraint_degree-1`, // then one column for the inverse of `table + challenge` and one for the `Z` polynomial. @@ -35,13 +404,59 @@ impl Lookup { } } -/// logUp protocol from https://ia.cr/2022/1530 +/// Randomness for a single instance of a permutation check protocol. 
+#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub(crate) struct GrandProductChallenge { + /// Randomness used to combine multiple columns into one. + pub(crate) beta: T, + /// Random offset that's added to the beta-reduced column values. + pub(crate) gamma: T, +} + +impl GrandProductChallenge { + pub(crate) fn combine<'a, FE, P, T: IntoIterator, const D2: usize>( + &self, + terms: T, + ) -> P + where + FE: FieldExtension, + P: PackedField, + T::IntoIter: DoubleEndedIterator, + { + reduce_with_powers(terms, FE::from_basefield(self.beta)) + FE::from_basefield(self.gamma) + } +} + +impl GrandProductChallenge { + pub(crate) fn combine_circuit, const D: usize>( + &self, + builder: &mut CircuitBuilder, + terms: &[ExtensionTarget], + ) -> ExtensionTarget { + let reduced = reduce_with_powers_ext_circuit(builder, terms, self.beta); + let gamma = builder.convert_to_ext(self.gamma); + builder.add_extension(reduced, gamma) + } +} + +impl GrandProductChallenge { + pub(crate) fn combine_base_circuit, const D: usize>( + &self, + builder: &mut CircuitBuilder, + terms: &[Target], + ) -> Target { + let reduced = reduce_with_powers_circuit(builder, terms, self.beta); + builder.add(reduced, self.gamma) + } +} + +/// logUp protocol from /// Compute the helper columns for the lookup argument. /// Given columns `f0,...,fk` and a column `t`, such that `∪fi ⊆ t`, and challenges `x`, /// this computes the helper columns `h_i = 1/(x+f_2i) + 1/(x+f_2i+1)`, `g = 1/(x+t)`, /// and `Z(gx) = Z(x) + sum h_i(x) - m(x)g(x)` where `m` is the frequencies column. pub(crate) fn lookup_helper_columns( - lookup: &Lookup, + lookup: &Lookup, trace_poly_values: &[PolynomialValues], challenge: F, constraint_degree: usize, @@ -51,46 +466,50 @@ pub(crate) fn lookup_helper_columns( "TODO: Allow other constraint degrees." 
); + assert_eq!(lookup.columns.len(), lookup.filter_columns.len()); + let num_total_logup_entries = trace_poly_values[0].values.len() * lookup.columns.len(); assert!(BigUint::from(num_total_logup_entries) < F::characteristic()); let num_helper_columns = lookup.num_helper_columns(constraint_degree); - let mut helper_columns: Vec> = Vec::with_capacity(num_helper_columns); + let looking_cols = lookup + .columns + .iter() + .map(|col| vec![col.clone()]) + .collect::>>>(); + + let grand_challenge = GrandProductChallenge { + beta: F::ONE, + gamma: challenge, + }; + + let columns_filters = looking_cols + .iter() + .zip(lookup.filter_columns.iter()) + .map(|(col, filter)| (&col[..], filter)) + .collect::>(); // For each batch of `constraint_degree-1` columns `fi`, compute `sum 1/(f_i+challenge)` and // add it to the helper columns. - // TODO: This does one batch inversion per column. It would also be possible to do one batch inversion - // for every group of columns, but that would require building a big vector of all the columns concatenated. - // Not sure which approach is better. // Note: these are the h_k(x) polynomials in the paper, with a few differences: // * Here, the first ratio m_0(x)/phi_0(x) is not included with the columns batched up to create the // h_k polynomials; instead there's a separate helper column for it (see below). // * Here, we use 1 instead of -1 as the numerator (and subtract later). // * Here, for now, the batch size (l) is always constraint_degree - 1 = 2. - for mut col_inds in &lookup.columns.iter().chunks(constraint_degree - 1) { - let first = *col_inds.next().unwrap(); - // TODO: The clone could probably be avoided by using a modified version of `batch_multiplicative_inverse` - // taking `challenge` as an additional argument. 
- let mut column = trace_poly_values[first].values.clone(); - for x in column.iter_mut() { - *x = challenge + *x; - } - let mut acc = F::batch_multiplicative_inverse(&column); - for &ind in col_inds { - let mut column = trace_poly_values[ind].values.clone(); - for x in column.iter_mut() { - *x = challenge + *x; - } - column = F::batch_multiplicative_inverse(&column); - batch_add_inplace(&mut acc, &column); - } - helper_columns.push(acc.into()); - } + // * Here, there are filters for the columns, to only select some rows + // in a given column. + let mut helper_columns = get_helper_cols( + trace_poly_values, + trace_poly_values[0].len(), + &columns_filters, + grand_challenge, + constraint_degree, + ); // Add `1/(table+challenge)` to the helper columns. // This is 1/phi_0(x) = 1/(x + t(x)) from the paper. // Here, we don't include m(x) in the numerator, instead multiplying it with this column later. - let mut table = trace_poly_values[lookup.table_column].values.clone(); + let mut table = lookup.table_column.eval_all_rows(trace_poly_values); for x in table.iter_mut() { *x = challenge + *x; } @@ -100,7 +519,7 @@ pub(crate) fn lookup_helper_columns( // This enforces the check from the paper, that the sum of the h_k(x) polynomials is 0 over H. // In the paper, that sum includes m(x)/(x + t(x)) = frequencies(x)/g(x), because that was bundled // into the h_k(x) polynomials. - let frequencies = &trace_poly_values[lookup.frequencies_column].values; + let frequencies = &lookup.frequencies_column.eval_all_rows(trace_poly_values); let mut z = Vec::with_capacity(frequencies.len()); z.push(F::ZERO); for i in 0..frequencies.len() - 1 { @@ -116,7 +535,214 @@ pub(crate) fn lookup_helper_columns( helper_columns } -pub struct LookupCheckVars +/// Given data associated to a lookup, check the associated helper polynomials. +pub(crate) fn eval_helper_columns( + filter: &[Option>], + columns: &[Vec

], + local_values: &[P], + next_values: &[P], + helper_columns: &[P], + constraint_degree: usize, + challenges: &GrandProductChallenge, + consumer: &mut ConstraintConsumer

, +) where + F: RichField + Extendable, + FE: FieldExtension, + P: PackedField, +{ + if !helper_columns.is_empty() { + for (j, chunk) in columns.chunks(constraint_degree - 1).enumerate() { + let fs = + &filter[(constraint_degree - 1) * j..(constraint_degree - 1) * j + chunk.len()]; + let h = helper_columns[j]; + + match chunk.len() { + 2 => { + let combin0 = challenges.combine(&chunk[0]); + let combin1 = challenges.combine(chunk[1].iter()); + + let f0 = if let Some(filter0) = &fs[0] { + filter0.eval_filter(local_values, next_values) + } else { + P::ONES + }; + let f1 = if let Some(filter1) = &fs[1] { + filter1.eval_filter(local_values, next_values) + } else { + P::ONES + }; + + consumer.constraint(combin1 * combin0 * h - f0 * combin1 - f1 * combin0); + } + 1 => { + let combin = challenges.combine(&chunk[0]); + let f0 = if let Some(filter1) = &fs[0] { + filter1.eval_filter(local_values, next_values) + } else { + P::ONES + }; + consumer.constraint(combin * h - f0); + } + + _ => todo!("Allow other constraint degrees"), + } + } + } +} + +/// Circuit version of `eval_helper_columns`. +/// Given data associated to a lookup (either a CTL or a range-check), check the associated helper polynomials. 
+pub(crate) fn eval_helper_columns_circuit, const D: usize>( + builder: &mut CircuitBuilder, + filter: &[Option>], + columns: &[Vec>], + local_values: &[ExtensionTarget], + next_values: &[ExtensionTarget], + helper_columns: &[ExtensionTarget], + constraint_degree: usize, + challenges: &GrandProductChallenge, + consumer: &mut RecursiveConstraintConsumer, +) { + if !helper_columns.is_empty() { + for (j, chunk) in columns.chunks(constraint_degree - 1).enumerate() { + let fs = + &filter[(constraint_degree - 1) * j..(constraint_degree - 1) * j + chunk.len()]; + let h = helper_columns[j]; + + let one = builder.one_extension(); + match chunk.len() { + 2 => { + let combin0 = challenges.combine_circuit(builder, &chunk[0]); + let combin1 = challenges.combine_circuit(builder, &chunk[1]); + + let f0 = if let Some(filter0) = &fs[0] { + filter0.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + let f1 = if let Some(filter1) = &fs[1] { + filter1.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + + let constr = builder.mul_sub_extension(combin0, h, f0); + let constr = builder.mul_extension(constr, combin1); + let f1_constr = builder.mul_extension(f1, combin0); + let constr = builder.sub_extension(constr, f1_constr); + + consumer.constraint(builder, constr); + } + 1 => { + let combin = challenges.combine_circuit(builder, &chunk[0]); + let f0 = if let Some(filter1) = &fs[0] { + filter1.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + let constr = builder.mul_sub_extension(combin, h, f0); + consumer.constraint(builder, constr); + } + + _ => todo!("Allow other constraint degrees"), + } + } + } +} + +/// Given a STARK's trace, and the data associated to one lookup (either CTL or range check), +/// returns the associated helper polynomials. 
+pub(crate) fn get_helper_cols( + trace: &[PolynomialValues], + degree: usize, + columns_filters: &[ColumnFilter], + challenge: GrandProductChallenge, + constraint_degree: usize, +) -> Vec> { + let num_helper_columns = ceil_div_usize(columns_filters.len(), constraint_degree - 1); + + let mut helper_columns = Vec::with_capacity(num_helper_columns); + + for mut cols_filts in &columns_filters.iter().chunks(constraint_degree - 1) { + let (first_col, first_filter) = cols_filts.next().unwrap(); + + let mut filter_col = Vec::with_capacity(degree); + let first_combined = (0..degree) + .map(|d| { + let f = if let Some(filter) = first_filter { + let f = filter.eval_table(trace, d); + filter_col.push(f); + f + } else { + filter_col.push(F::ONE); + F::ONE + }; + if f.is_one() { + let evals = first_col + .iter() + .map(|c| c.eval_table(trace, d)) + .collect::>(); + challenge.combine(evals.iter()) + } else { + assert_eq!(f, F::ZERO, "Non-binary filter?"); + // Dummy value. Cannot be zero since it will be batch-inverted. + F::ONE + } + }) + .collect::>(); + + let mut acc = F::batch_multiplicative_inverse(&first_combined); + for d in 0..degree { + if filter_col[d].is_zero() { + acc[d] = F::ZERO; + } + } + + for (col, filt) in cols_filts { + let mut filter_col = Vec::with_capacity(degree); + let mut combined = (0..degree) + .map(|d| { + let f = if let Some(filter) = filt { + let f = filter.eval_table(trace, d); + filter_col.push(f); + f + } else { + filter_col.push(F::ONE); + F::ONE + }; + if f.is_one() { + let evals = col + .iter() + .map(|c| c.eval_table(trace, d)) + .collect::>(); + challenge.combine(evals.iter()) + } else { + assert_eq!(f, F::ZERO, "Non-binary filter?"); + // Dummy value. Cannot be zero since it will be batch-inverted. 
+ F::ONE + } + }) + .collect::>(); + + combined = F::batch_multiplicative_inverse(&combined); + + for d in 0..degree { + if filter_col[d].is_zero() { + combined[d] = F::ZERO; + } + } + + batch_add_inplace(&mut acc, &combined); + } + + helper_columns.push(acc.into()); + } + assert_eq!(helper_columns.len(), num_helper_columns); + + helper_columns +} + +pub(crate) struct LookupCheckVars where F: Field, FE: FieldExtension, @@ -130,7 +756,7 @@ where /// Constraints for the logUp lookup argument. pub(crate) fn eval_packed_lookups_generic( stark: &S, - lookups: &[Lookup], + lookups: &[Lookup], vars: &S::EvaluationFrame, lookup_vars: LookupCheckVars, yield_constr: &mut ConstraintConsumer

, @@ -140,46 +766,57 @@ pub(crate) fn eval_packed_lookups_generic, S: Stark, { + let local_values = vars.get_local_values(); + let next_values = vars.get_next_values(); let degree = stark.constraint_degree(); assert_eq!(degree, 3, "TODO: Allow other constraint degrees."); let mut start = 0; for lookup in lookups { let num_helper_columns = lookup.num_helper_columns(degree); for &challenge in &lookup_vars.challenges { + let grand_challenge = GrandProductChallenge { + beta: F::ONE, + gamma: challenge, + }; + let lookup_columns = lookup + .columns + .iter() + .map(|col| vec![col.eval_with_next(local_values, next_values)]) + .collect::>>(); + + // For each chunk, check that `h_i (x+f_2i) (x+f_{2i+1}) = (x+f_2i) * filter_{2i+1} + (x+f_{2i+1}) * filter_2i` if the chunk has length 2 + // or if it has length 1, check that `h_i * (x+f_2i) = filter_2i`, where x is the challenge + eval_helper_columns( + &lookup.filter_columns, + &lookup_columns, + local_values, + next_values, + &lookup_vars.local_values[start..start + num_helper_columns - 1], + degree, + &grand_challenge, + yield_constr, + ); + let challenge = FE::from_basefield(challenge); - // For each chunk, check that `h_i (x+f_2i) (x+f_2i+1) = (x+f_2i) + (x+f_2i+1)` if the chunk has length 2 - // or if it has length 1, check that `h_i * (x+f_2i) = 1`, where x is the challenge - for (j, chunk) in lookup.columns.chunks(degree - 1).enumerate() { - let mut x = lookup_vars.local_values[start + j]; - let mut y = P::ZEROS; - let fs = chunk.iter().map(|&k| vars.get_local_values()[k]); - for f in fs { - x *= f + challenge; - y += f + challenge; - } - match chunk.len() { - 2 => yield_constr.constraint(x - y), - 1 => yield_constr.constraint(x - P::ONES), - _ => todo!("Allow other constraint degrees."), - } - } // Check the `Z` polynomial. 
let z = lookup_vars.local_values[start + num_helper_columns - 1]; let next_z = lookup_vars.next_values[start + num_helper_columns - 1]; - let table_with_challenge = vars.get_local_values()[lookup.table_column] + challenge; + let table_with_challenge = lookup.table_column.eval(local_values) + challenge; let y = lookup_vars.local_values[start..start + num_helper_columns - 1] .iter() .fold(P::ZEROS, |acc, x| acc + *x) * table_with_challenge - - vars.get_local_values()[lookup.frequencies_column]; + - lookup.frequencies_column.eval(local_values); + // Check that in the first row, z = 0; + yield_constr.constraint_first_row(z); yield_constr.constraint((next_z - z) * table_with_challenge - y); start += num_helper_columns; } } } -pub struct LookupCheckVarsTarget { +pub(crate) struct LookupCheckVarsTarget { pub(crate) local_values: Vec>, pub(crate) next_values: Vec>, pub(crate) challenges: Vec, @@ -196,48 +833,58 @@ pub(crate) fn eval_ext_lookups_circuit< lookup_vars: LookupCheckVarsTarget, yield_constr: &mut RecursiveConstraintConsumer, ) { - let one = builder.one_extension(); let degree = stark.constraint_degree(); let lookups = stark.lookups(); + + let local_values = vars.get_local_values(); + let next_values = vars.get_next_values(); assert_eq!(degree, 3, "TODO: Allow other constraint degrees."); let mut start = 0; for lookup in lookups { let num_helper_columns = lookup.num_helper_columns(degree); + let col_values = lookup + .columns + .iter() + .map(|col| vec![col.eval_with_next_circuit(builder, local_values, next_values)]) + .collect::>(); + for &challenge in &lookup_vars.challenges { + let grand_challenge = GrandProductChallenge { + beta: builder.one(), + gamma: challenge, + }; + + eval_helper_columns_circuit( + builder, + &lookup.filter_columns, + &col_values, + local_values, + next_values, + &lookup_vars.local_values[start..start + num_helper_columns - 1], + degree, + &grand_challenge, + yield_constr, + ); let challenge = builder.convert_to_ext(challenge); - for (j, 
chunk) in lookup.columns.chunks(degree - 1).enumerate() { - let mut x = lookup_vars.local_values[start + j]; - let mut y = builder.zero_extension(); - let fs = chunk.iter().map(|&k| vars.get_local_values()[k]); - for f in fs { - let tmp = builder.add_extension(f, challenge); - x = builder.mul_extension(x, tmp); - y = builder.add_extension(y, tmp); - } - match chunk.len() { - 2 => { - let tmp = builder.sub_extension(x, y); - yield_constr.constraint(builder, tmp) - } - 1 => { - let tmp = builder.sub_extension(x, one); - yield_constr.constraint(builder, tmp) - } - _ => todo!("Allow other constraint degrees."), - } - } let z = lookup_vars.local_values[start + num_helper_columns - 1]; let next_z = lookup_vars.next_values[start + num_helper_columns - 1]; - let table_with_challenge = - builder.add_extension(vars.get_local_values()[lookup.table_column], challenge); + let table_column = lookup + .table_column + .eval_circuit(builder, vars.get_local_values()); + let table_with_challenge = builder.add_extension(table_column, challenge); let mut y = builder.add_many_extension( &lookup_vars.local_values[start..start + num_helper_columns - 1], ); + let frequencies_column = lookup + .frequencies_column + .eval_circuit(builder, vars.get_local_values()); y = builder.mul_extension(y, table_with_challenge); - y = builder.sub_extension(y, vars.get_local_values()[lookup.frequencies_column]); + y = builder.sub_extension(y, frequencies_column); + // Check that in the first row, z = 0; + yield_constr.constraint_first_row(builder, z); let mut constraint = builder.sub_extension(next_z, z); constraint = builder.mul_extension(constraint, table_with_challenge); constraint = builder.sub_extension(constraint, y); diff --git a/evm/src/memory/columns.rs b/evm/src/memory/columns.rs index 9a41323200..2010bf33ec 100644 --- a/evm/src/memory/columns.rs +++ b/evm/src/memory/columns.rs @@ -5,10 +5,18 @@ use crate::memory::VALUE_LIMBS; // Columns for memory operations, ordered by (addr, timestamp). 
/// 1 if this is an actual memory operation, or 0 if it's a padding row. pub(crate) const FILTER: usize = 0; +/// Each memory operation is associated to a unique timestamp. +/// For a given memory operation `op_i`, its timestamp is computed as `C * N + i` +/// where `C` is the CPU clock at that time, `N` is the number of general memory channels, +/// and `i` is the index of the memory channel at which the memory operation is performed. pub(crate) const TIMESTAMP: usize = FILTER + 1; +/// 1 if this is a read operation, 0 if it is a write one. pub(crate) const IS_READ: usize = TIMESTAMP + 1; +/// The execution context of this address. pub(crate) const ADDR_CONTEXT: usize = IS_READ + 1; +/// The segment section of this address. pub(crate) const ADDR_SEGMENT: usize = ADDR_CONTEXT + 1; +/// The virtual address within the given context and segment. pub(crate) const ADDR_VIRTUAL: usize = ADDR_SEGMENT + 1; // Eight 32-bit limbs hold a total of 256 bits. @@ -27,8 +35,12 @@ pub(crate) const CONTEXT_FIRST_CHANGE: usize = VALUE_START + VALUE_LIMBS; pub(crate) const SEGMENT_FIRST_CHANGE: usize = CONTEXT_FIRST_CHANGE + 1; pub(crate) const VIRTUAL_FIRST_CHANGE: usize = SEGMENT_FIRST_CHANGE + 1; +// Used to lower the degree of the zero-initializing constraints. +// Contains `next_segment * addr_changed * next_is_read`. +pub(crate) const INITIALIZE_AUX: usize = VIRTUAL_FIRST_CHANGE + 1; + // We use a range check to enforce the ordering. -pub(crate) const RANGE_CHECK: usize = VIRTUAL_FIRST_CHANGE + 1; +pub(crate) const RANGE_CHECK: usize = INITIALIZE_AUX + 1; /// The counter column (used for the range check) starts from 0 and increments. pub(crate) const COUNTER: usize = RANGE_CHECK + 1; /// The frequencies column used in logUp. 
diff --git a/evm/src/memory/memory_stark.rs b/evm/src/memory/memory_stark.rs index 4a63f50a7a..44d2af6ae2 100644 --- a/evm/src/memory/memory_stark.rs +++ b/evm/src/memory/memory_stark.rs @@ -1,4 +1,4 @@ -use std::marker::PhantomData; +use core::marker::PhantomData; use ethereum_types::U256; use itertools::Itertools; @@ -13,21 +13,26 @@ use plonky2::util::timing::TimingTree; use plonky2::util::transpose; use plonky2_maybe_rayon::*; +use super::segments::Segment; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::cross_table_lookup::Column; use crate::evaluation_frame::{StarkEvaluationFrame, StarkFrame}; -use crate::lookup::Lookup; +use crate::lookup::{Column, Filter, Lookup}; use crate::memory::columns::{ value_limb, ADDR_CONTEXT, ADDR_SEGMENT, ADDR_VIRTUAL, CONTEXT_FIRST_CHANGE, COUNTER, FILTER, - FREQUENCIES, IS_READ, NUM_COLUMNS, RANGE_CHECK, SEGMENT_FIRST_CHANGE, TIMESTAMP, - VIRTUAL_FIRST_CHANGE, + FREQUENCIES, INITIALIZE_AUX, IS_READ, NUM_COLUMNS, RANGE_CHECK, SEGMENT_FIRST_CHANGE, + TIMESTAMP, VIRTUAL_FIRST_CHANGE, }; use crate::memory::VALUE_LIMBS; use crate::stark::Stark; use crate::witness::memory::MemoryOpKind::Read; use crate::witness::memory::{MemoryAddress, MemoryOp}; -pub fn ctl_data() -> Vec> { +/// Creates the vector of `Columns` corresponding to: +/// - the memory operation type, +/// - the address in memory of the element being read/written, +/// - the value being read/written, +/// - the timestamp at which the element is read/written. +pub(crate) fn ctl_data() -> Vec> { let mut res = Column::singles([IS_READ, ADDR_CONTEXT, ADDR_SEGMENT, ADDR_VIRTUAL]).collect_vec(); res.extend(Column::singles((0..8).map(value_limb))); @@ -35,12 +40,13 @@ pub fn ctl_data() -> Vec> { res } -pub fn ctl_filter() -> Column { - Column::single(FILTER) +/// CTL filter for memory operations. 
+pub(crate) fn ctl_filter() -> Filter { + Filter::new_simple(Column::single(FILTER)) } #[derive(Copy, Clone, Default)] -pub struct MemoryStark { +pub(crate) struct MemoryStark { pub(crate) f: PhantomData, } @@ -70,7 +76,9 @@ impl MemoryOp { } /// Generates the `_FIRST_CHANGE` columns and the `RANGE_CHECK` column in the trace. -pub fn generate_first_change_flags_and_rc(trace_rows: &mut [[F; NUM_COLUMNS]]) { +pub(crate) fn generate_first_change_flags_and_rc( + trace_rows: &mut [[F; NUM_COLUMNS]], +) { let num_ops = trace_rows.len(); for idx in 0..num_ops - 1 { let row = trace_rows[idx].as_slice(); @@ -84,6 +92,7 @@ pub fn generate_first_change_flags_and_rc(trace_rows: &mut [[F; NU let next_segment = next_row[ADDR_SEGMENT]; let next_virt = next_row[ADDR_VIRTUAL]; let next_timestamp = next_row[TIMESTAMP]; + let next_is_read = next_row[IS_READ]; let context_changed = context != next_context; let segment_changed = segment != next_segment; @@ -114,6 +123,10 @@ pub fn generate_first_change_flags_and_rc(trace_rows: &mut [[F; NU "Range check of {} is too large. Bug in fill_gaps?", row[RANGE_CHECK] ); + + let address_changed = + row[CONTEXT_FIRST_CHANGE] + row[SEGMENT_FIRST_CHANGE] + row[VIRTUAL_FIRST_CHANGE]; + row[INITIALIZE_AUX] = next_segment * address_changed * next_is_read; } } @@ -145,8 +158,16 @@ impl, const D: usize> MemoryStark { trace_col_vecs[COUNTER] = (0..height).map(|i| F::from_canonical_usize(i)).collect(); for i in 0..height { - let x = trace_col_vecs[RANGE_CHECK][i].to_canonical_u64() as usize; - trace_col_vecs[FREQUENCIES][x] += F::ONE; + let x_rc = trace_col_vecs[RANGE_CHECK][i].to_canonical_u64() as usize; + trace_col_vecs[FREQUENCIES][x_rc] += F::ONE; + if (trace_col_vecs[CONTEXT_FIRST_CHANGE][i] == F::ONE) + || (trace_col_vecs[SEGMENT_FIRST_CHANGE][i] == F::ONE) + { + // CONTEXT_FIRST_CHANGE and SEGMENT_FIRST_CHANGE should be 0 at the last row, so the index + // should never be out of bounds. 
+ let x_fo = trace_col_vecs[ADDR_VIRTUAL][i + 1].to_canonical_u64() as usize; + trace_col_vecs[FREQUENCIES][x_fo] += F::ONE; + } } } @@ -162,7 +183,7 @@ impl, const D: usize> MemoryStark { /// reads to the same address, say at timestamps 50 and 80. fn fill_gaps(memory_ops: &mut Vec) { let max_rc = memory_ops.len().next_power_of_two() - 1; - for (mut curr, next) in memory_ops.clone().into_iter().tuple_windows() { + for (mut curr, mut next) in memory_ops.clone().into_iter().tuple_windows() { if curr.address.context != next.address.context || curr.address.segment != next.address.segment { @@ -172,6 +193,15 @@ impl, const D: usize> MemoryStark { // Similarly, the number of possible segments is a small constant, so any gap must // be small. max_rc will always be much larger, as just bootloading the kernel will // trigger thousands of memory operations. + // However, we do check that the first address accessed is range-checkable. If not, + // we could start at a negative address and cheat. + while next.address.virt > max_rc { + let mut dummy_address = next.address; + dummy_address.virt -= max_rc; + let dummy_read = MemoryOp::new_dummy_read(dummy_address, 0, U256::zero()); + memory_ops.push(dummy_read); + next = dummy_read; + } } else if curr.address.virt != next.address.virt { while next.address.virt - curr.address.virt - 1 > max_rc { let mut dummy_address = curr.address; @@ -274,6 +304,10 @@ impl, const D: usize> Stark for MemoryStark, const D: usize> Stark for MemoryStark, const D: usize> Stark for MemoryStark, const D: usize> Stark for MemoryStark usize { 3 } - fn lookups(&self) -> Vec { + fn lookups(&self) -> Vec> { vec![Lookup { - columns: vec![RANGE_CHECK], - table_column: COUNTER, - frequencies_column: FREQUENCIES, + columns: vec![ + Column::single(RANGE_CHECK), + Column::single_next_row(ADDR_VIRTUAL), + ], + table_column: Column::single(COUNTER), + frequencies_column: Column::single(FREQUENCIES), + filter_columns: vec![ + None, + 
Some(Filter::new_simple(Column::sum([ + CONTEXT_FIRST_CHANGE, + SEGMENT_FIRST_CHANGE, + ]))), + ], }] } } diff --git a/evm/src/memory/mod.rs b/evm/src/memory/mod.rs index 4cdfd1be5a..c61119530f 100644 --- a/evm/src/memory/mod.rs +++ b/evm/src/memory/mod.rs @@ -1,7 +1,13 @@ +//! The Memory STARK is used to handle all memory read and write operations happening when +//! executing the EVM. Each non-dummy row of the table correspond to a single operation, +//! and rows are ordered by the timestamp associated to each memory operation. + pub mod columns; pub mod memory_stark; pub mod segments; // TODO: Move to CPU module, now that channels have been removed from the memory table. pub(crate) const NUM_CHANNELS: usize = crate::cpu::membus::NUM_CHANNELS; +/// The number of limbs holding the value at a memory address. +/// Eight limbs of 32 bits can hold a `U256`. pub(crate) const VALUE_LIMBS: usize = 8; diff --git a/evm/src/memory/segments.rs b/evm/src/memory/segments.rs index ede0ad5513..6fd601eb5e 100644 --- a/evm/src/memory/segments.rs +++ b/evm/src/memory/segments.rs @@ -1,83 +1,87 @@ +use ethereum_types::U256; + +pub(crate) const SEGMENT_SCALING_FACTOR: usize = 32; + +/// This contains all the existing memory segments. The values in the enum are shifted by 32 bits +/// to allow for convenient address components (context / segment / virtual) bundling in the kernel. #[allow(dead_code)] +#[allow(clippy::enum_clike_unportable_variant)] #[derive(Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd, Debug)] -pub enum Segment { +pub(crate) enum Segment { /// Contains EVM bytecode. + // The Kernel has optimizations relying on the Code segment being 0. + // This shouldn't be changed! Code = 0, /// The program stack. - Stack = 1, + Stack = 1 << SEGMENT_SCALING_FACTOR, /// Main memory, owned by the contract code. - MainMemory = 2, + MainMemory = 2 << SEGMENT_SCALING_FACTOR, /// Data passed to the current context by its caller. 
- Calldata = 3, + Calldata = 3 << SEGMENT_SCALING_FACTOR, /// Data returned to the current context by its latest callee. - Returndata = 4, + Returndata = 4 << SEGMENT_SCALING_FACTOR, /// A segment which contains a few fixed-size metadata fields, such as the caller's context, or the /// size of `CALLDATA` and `RETURNDATA`. - GlobalMetadata = 5, - ContextMetadata = 6, + GlobalMetadata = 5 << SEGMENT_SCALING_FACTOR, + ContextMetadata = 6 << SEGMENT_SCALING_FACTOR, /// General purpose kernel memory, used by various kernel functions. /// In general, calling a helper function can result in this memory being clobbered. - KernelGeneral = 7, + KernelGeneral = 7 << SEGMENT_SCALING_FACTOR, /// Another segment for general purpose kernel use. - KernelGeneral2 = 8, + KernelGeneral2 = 8 << SEGMENT_SCALING_FACTOR, /// Segment to hold account code for opcodes like `CODESIZE, CODECOPY,...`. - KernelAccountCode = 9, + KernelAccountCode = 9 << SEGMENT_SCALING_FACTOR, /// Contains normalized transaction fields; see `NormalizedTxnField`. - TxnFields = 10, + TxnFields = 10 << SEGMENT_SCALING_FACTOR, /// Contains the data field of a transaction. - TxnData = 11, + TxnData = 11 << SEGMENT_SCALING_FACTOR, /// A buffer used to hold raw RLP data. - RlpRaw = 12, + RlpRaw = 12 << SEGMENT_SCALING_FACTOR, /// Contains all trie data. It is owned by the kernel, so it only lives on context 0. - TrieData = 13, - /// A buffer used to store the encodings of a branch node's children. - TrieEncodedChild = 14, - /// A buffer used to store the lengths of the encodings of a branch node's children. - TrieEncodedChildLen = 15, - /// A table of values 2^i for i=0..255 for use with shift - /// instructions; initialised by `kernel/asm/shift.asm::init_shift_table()`. 
- ShiftTable = 16, - JumpdestBits = 17, - EcdsaTable = 18, - BnWnafA = 19, - BnWnafB = 20, - BnTableQ = 21, - BnPairing = 22, + TrieData = 13 << SEGMENT_SCALING_FACTOR, + ShiftTable = 14 << SEGMENT_SCALING_FACTOR, + JumpdestBits = 15 << SEGMENT_SCALING_FACTOR, + EcdsaTable = 16 << SEGMENT_SCALING_FACTOR, + BnWnafA = 17 << SEGMENT_SCALING_FACTOR, + BnWnafB = 18 << SEGMENT_SCALING_FACTOR, + BnTableQ = 19 << SEGMENT_SCALING_FACTOR, + BnPairing = 20 << SEGMENT_SCALING_FACTOR, /// List of addresses that have been accessed in the current transaction. - AccessedAddresses = 23, + AccessedAddresses = 21 << SEGMENT_SCALING_FACTOR, /// List of storage keys that have been accessed in the current transaction. - AccessedStorageKeys = 24, + AccessedStorageKeys = 22 << SEGMENT_SCALING_FACTOR, /// List of addresses that have called SELFDESTRUCT in the current transaction. - SelfDestructList = 25, + SelfDestructList = 23 << SEGMENT_SCALING_FACTOR, /// Contains the bloom filter of a transaction. - TxnBloom = 26, - /// Contains the computed bloom filter of a block. - BlockBloom = 27, - /// Contains the final block bloom, and the block bloom filters before and after the current transaction. - /// The first eight elements are `block_metadata.block_bloom`. The next eight are `block_bloom_before`, - /// and the last eight are `block_bloom_after. - GlobalBlockBloom = 28, + TxnBloom = 24 << SEGMENT_SCALING_FACTOR, + /// Contains the bloom filter present in the block header. + GlobalBlockBloom = 25 << SEGMENT_SCALING_FACTOR, /// List of log pointers pointing to the LogsData segment. - Logs = 29, - LogsData = 30, + Logs = 26 << SEGMENT_SCALING_FACTOR, + LogsData = 27 << SEGMENT_SCALING_FACTOR, /// Journal of state changes. List of pointers to `JournalData`. Length in `GlobalMetadata`. 
- Journal = 31, - JournalData = 32, - JournalCheckpoints = 33, + Journal = 28 << SEGMENT_SCALING_FACTOR, + JournalData = 29 << SEGMENT_SCALING_FACTOR, + JournalCheckpoints = 30 << SEGMENT_SCALING_FACTOR, /// List of addresses that have been touched in the current transaction. - TouchedAddresses = 34, + TouchedAddresses = 31 << SEGMENT_SCALING_FACTOR, /// List of checkpoints for the current context. Length in `ContextMetadata`. - ContextCheckpoints = 35, + ContextCheckpoints = 32 << SEGMENT_SCALING_FACTOR, /// List of 256 previous block hashes. - BlockHashes = 36, + BlockHashes = 33 << SEGMENT_SCALING_FACTOR, /// List of contracts which have been created during the current transaction. - CreatedContracts = 37, + CreatedContracts = 34 << SEGMENT_SCALING_FACTOR, } impl Segment { - pub(crate) const COUNT: usize = 38; + pub(crate) const COUNT: usize = 35; + + /// Unscales this segment by `SEGMENT_SCALING_FACTOR`. + pub(crate) const fn unscale(&self) -> usize { + *self as usize >> SEGMENT_SCALING_FACTOR + } - pub(crate) fn all() -> [Self; Self::COUNT] { + pub(crate) const fn all() -> [Self; Self::COUNT] { [ Self::Code, Self::Stack, @@ -93,8 +97,6 @@ impl Segment { Self::TxnData, Self::RlpRaw, Self::TrieData, - Self::TrieEncodedChild, - Self::TrieEncodedChildLen, Self::ShiftTable, Self::JumpdestBits, Self::EcdsaTable, @@ -106,7 +108,6 @@ impl Segment { Self::AccessedStorageKeys, Self::SelfDestructList, Self::TxnBloom, - Self::BlockBloom, Self::GlobalBlockBloom, Self::Logs, Self::LogsData, @@ -121,7 +122,7 @@ impl Segment { } /// The variable name that gets passed into kernel assembly code. 
- pub(crate) fn var_name(&self) -> &'static str { + pub(crate) const fn var_name(&self) -> &'static str { match self { Segment::Code => "SEGMENT_CODE", Segment::Stack => "SEGMENT_STACK", @@ -137,20 +138,17 @@ impl Segment { Segment::TxnData => "SEGMENT_TXN_DATA", Segment::RlpRaw => "SEGMENT_RLP_RAW", Segment::TrieData => "SEGMENT_TRIE_DATA", - Segment::TrieEncodedChild => "SEGMENT_TRIE_ENCODED_CHILD", - Segment::TrieEncodedChildLen => "SEGMENT_TRIE_ENCODED_CHILD_LEN", Segment::ShiftTable => "SEGMENT_SHIFT_TABLE", Segment::JumpdestBits => "SEGMENT_JUMPDEST_BITS", - Segment::EcdsaTable => "SEGMENT_KERNEL_ECDSA_TABLE", - Segment::BnWnafA => "SEGMENT_KERNEL_BN_WNAF_A", - Segment::BnWnafB => "SEGMENT_KERNEL_BN_WNAF_B", - Segment::BnTableQ => "SEGMENT_KERNEL_BN_TABLE_Q", - Segment::BnPairing => "SEGMENT_KERNEL_BN_PAIRING", + Segment::EcdsaTable => "SEGMENT_ECDSA_TABLE", + Segment::BnWnafA => "SEGMENT_BN_WNAF_A", + Segment::BnWnafB => "SEGMENT_BN_WNAF_B", + Segment::BnTableQ => "SEGMENT_BN_TABLE_Q", + Segment::BnPairing => "SEGMENT_BN_PAIRING", Segment::AccessedAddresses => "SEGMENT_ACCESSED_ADDRESSES", Segment::AccessedStorageKeys => "SEGMENT_ACCESSED_STORAGE_KEYS", Segment::SelfDestructList => "SEGMENT_SELFDESTRUCT_LIST", Segment::TxnBloom => "SEGMENT_TXN_BLOOM", - Segment::BlockBloom => "SEGMENT_BLOCK_BLOOM", Segment::GlobalBlockBloom => "SEGMENT_GLOBAL_BLOCK_BLOOM", Segment::Logs => "SEGMENT_LOGS", Segment::LogsData => "SEGMENT_LOGS_DATA", @@ -164,8 +162,7 @@ impl Segment { } } - #[allow(dead_code)] - pub(crate) fn bit_range(&self) -> usize { + pub(crate) const fn bit_range(&self) -> usize { match self { Segment::Code => 8, Segment::Stack => 256, @@ -181,8 +178,6 @@ impl Segment { Segment::TxnData => 8, Segment::RlpRaw => 8, Segment::TrieData => 256, - Segment::TrieEncodedChild => 256, - Segment::TrieEncodedChildLen => 6, Segment::ShiftTable => 256, Segment::JumpdestBits => 1, Segment::EcdsaTable => 256, @@ -195,7 +190,6 @@ impl Segment { Segment::SelfDestructList => 
256, Segment::TxnBloom => 8, Segment::GlobalBlockBloom => 256, - Segment::BlockBloom => 8, Segment::Logs => 256, Segment::LogsData => 256, Segment::Journal => 256, @@ -207,4 +201,17 @@ impl Segment { Segment::CreatedContracts => 256, } } + + pub(crate) fn constant(&self, virt: usize) -> Option { + match self { + Segment::RlpRaw => { + if virt == 0xFFFFFFFF { + Some(U256::from(0x80)) + } else { + None + } + } + _ => None, + } + } } diff --git a/evm/src/proof.rs b/evm/src/proof.rs index 88ca27167b..8e36a90c64 100644 --- a/evm/src/proof.rs +++ b/evm/src/proof.rs @@ -19,43 +19,118 @@ use serde::{Deserialize, Serialize}; use crate::all_stark::NUM_TABLES; use crate::config::StarkConfig; use crate::cross_table_lookup::GrandProductChallengeSet; +use crate::util::{get_h160, get_h256, h2u}; /// A STARK proof for each table, plus some metadata used to create recursive wrapper proofs. #[derive(Debug, Clone)] pub struct AllProof, C: GenericConfig, const D: usize> { + /// Proofs for all the different STARK modules. pub stark_proofs: [StarkProofWithMetadata; NUM_TABLES], + /// Cross-table lookup challenges. pub(crate) ctl_challenges: GrandProductChallengeSet, + /// Public memory values used for the recursive proofs. pub public_values: PublicValues, } impl, C: GenericConfig, const D: usize> AllProof { + /// Returns the degree (i.e. the trace length) of each STARK. pub fn degree_bits(&self, config: &StarkConfig) -> [usize; NUM_TABLES] { core::array::from_fn(|i| self.stark_proofs[i].proof.recover_degree_bits(config)) } } +/// Randomness for all STARKs. pub(crate) struct AllProofChallenges, const D: usize> { + /// Randomness used in each STARK proof. pub stark_challenges: [StarkProofChallenges; NUM_TABLES], + /// Randomness used for cross-table lookups. It is shared by all STARKs. pub ctl_challenges: GrandProductChallengeSet, } /// Memory values which are public. 
-#[derive(Debug, Clone, Default, Deserialize, Serialize)] +#[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] pub struct PublicValues { + /// Trie hashes before the execution of the local state transition pub trie_roots_before: TrieRoots, + /// Trie hashes after the execution of the local state transition. pub trie_roots_after: TrieRoots, + /// Block metadata: it remains unchanged within a block. pub block_metadata: BlockMetadata, + /// 256 previous block hashes and current block's hash. pub block_hashes: BlockHashes, + /// Extra block data that is specific to the current proof. pub extra_block_data: ExtraBlockData, } -#[derive(Debug, Clone, Default, Serialize, Deserialize)] +impl PublicValues { + /// Extracts public values from the given public inputs of a proof. + /// Public values are always the first public inputs added to the circuit, + /// so we can start extracting at index 0. + pub fn from_public_inputs(pis: &[F]) -> Self { + assert!( + pis.len() + > TrieRootsTarget::SIZE * 2 + + BlockMetadataTarget::SIZE + + BlockHashesTarget::SIZE + + ExtraBlockDataTarget::SIZE + - 1 + ); + + let trie_roots_before = TrieRoots::from_public_inputs(&pis[0..TrieRootsTarget::SIZE]); + let trie_roots_after = + TrieRoots::from_public_inputs(&pis[TrieRootsTarget::SIZE..TrieRootsTarget::SIZE * 2]); + let block_metadata = BlockMetadata::from_public_inputs( + &pis[TrieRootsTarget::SIZE * 2..TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE], + ); + let block_hashes = BlockHashes::from_public_inputs( + &pis[TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE + ..TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE + BlockHashesTarget::SIZE], + ); + let extra_block_data = ExtraBlockData::from_public_inputs( + &pis[TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE + BlockHashesTarget::SIZE + ..TrieRootsTarget::SIZE * 2 + + BlockMetadataTarget::SIZE + + BlockHashesTarget::SIZE + + ExtraBlockDataTarget::SIZE], + ); + + Self { + trie_roots_before, + 
trie_roots_after, + block_metadata, + block_hashes, + extra_block_data, + } + } +} + +/// Trie hashes. +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct TrieRoots { + /// State trie hash. pub state_root: H256, + /// Transaction trie hash. pub transactions_root: H256, + /// Receipts trie hash. pub receipts_root: H256, } +impl TrieRoots { + pub fn from_public_inputs(pis: &[F]) -> Self { + assert!(pis.len() == TrieRootsTarget::SIZE); + + let state_root = get_h256(&pis[0..8]); + let transactions_root = get_h256(&pis[8..16]); + let receipts_root = get_h256(&pis[16..24]); + + Self { + state_root, + transactions_root, + receipts_root, + } + } +} + // There should be 256 previous hashes stored, so the default should also contain 256 values. impl Default for BlockHashes { fn default() -> Self { @@ -72,7 +147,7 @@ impl Default for BlockHashes { /// /// When the block number is less than 256, dummy values, i.e. `H256::default()`, /// should be used for the additional block hashes. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct BlockHashes { /// The previous 256 hashes to the current block. The leftmost hash, i.e. `prev_hashes[0]`, /// is the oldest, and the rightmost, i.e. `prev_hashes[255]` is the hash of the parent block. @@ -81,31 +156,40 @@ pub struct BlockHashes { pub cur_hash: H256, } -// TODO: Before going into production, `block_gas_used` and `block_gaslimit` here -// as well as `gas_used_before` / `gas_used_after` in `ExtraBlockData` should be -// updated to fit in a single 32-bit limb, as supporting 64-bit values for those -// fields is only necessary for testing purposes. 
+impl BlockHashes { + pub fn from_public_inputs(pis: &[F]) -> Self { + assert!(pis.len() == BlockHashesTarget::SIZE); + + let prev_hashes: [H256; 256] = core::array::from_fn(|i| get_h256(&pis[8 * i..8 + 8 * i])); + let cur_hash = get_h256(&pis[2048..2056]); + + Self { + prev_hashes: prev_hashes.to_vec(), + cur_hash, + } + } +} + /// Metadata contained in a block header. Those are identical between /// all state transition proofs within the same block. -#[derive(Debug, Clone, Default, Deserialize, Serialize)] +#[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] pub struct BlockMetadata { /// The address of this block's producer. pub block_beneficiary: Address, - /// The timestamp of this block. It must fit in a `u32`. + /// The timestamp of this block. pub block_timestamp: U256, - /// The index of this block. It must fit in a `u32`. + /// The index of this block. pub block_number: U256, /// The difficulty (before PoS transition) of this block. pub block_difficulty: U256, - /// The `mix_hash` value of this block. pub block_random: H256, - /// The gas limit of this block. It must fit in a `u64`. + /// The gas limit of this block. It must fit in a `u32`. pub block_gaslimit: U256, - /// The chain id of this block. It must fit in a `u32`. + /// The chain id of this block. pub block_chain_id: U256, - /// The base fee of this block. It must fit in a `u64`. + /// The base fee of this block. pub block_base_fee: U256, - /// The total gas used in this block. It must fit in a `u64`. + /// The total gas used in this block. It must fit in a `u32`. pub block_gas_used: U256, /// The blob base fee. It must fit in a `u64`. 
pub block_blob_base_fee: U256, @@ -114,12 +198,47 @@ pub struct BlockMetadata { pub block_bloom: [U256; 8], } +impl BlockMetadata { + pub fn from_public_inputs(pis: &[F]) -> Self { + assert!(pis.len() == BlockMetadataTarget::SIZE); + + let block_beneficiary = get_h160(&pis[0..5]); + let block_timestamp = pis[5].to_canonical_u64().into(); + let block_number = pis[6].to_canonical_u64().into(); + let block_difficulty = pis[7].to_canonical_u64().into(); + let block_random = get_h256(&pis[8..16]); + let block_gaslimit = pis[16].to_canonical_u64().into(); + let block_chain_id = pis[17].to_canonical_u64().into(); + let block_base_fee = + (pis[18].to_canonical_u64() + (pis[19].to_canonical_u64() << 32)).into(); + let block_gas_used = pis[20].to_canonical_u64().into(); + let block_blob_base_fee = + (pis[21].to_canonical_u64() + (pis[22].to_canonical_u64() << 32)).into(); + let block_bloom = + core::array::from_fn(|i| h2u(get_h256(&pis[23 + 8 * i..23 + 8 * (i + 1)]))); + + Self { + block_beneficiary, + block_timestamp, + block_number, + block_difficulty, + block_random, + block_gaslimit, + block_chain_id, + block_base_fee, + block_gas_used, + block_blob_base_fee, + block_bloom, + } + } +} + /// Additional block data that are specific to the local transaction being proven, /// unlike `BlockMetadata`. -#[derive(Debug, Clone, Default, Deserialize, Serialize)] +#[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] pub struct ExtraBlockData { - /// The state trie digest of the genesis block. - pub genesis_state_trie_root: H256, + /// The state trie digest of the checkpoint block. + pub checkpoint_state_trie_root: H256, /// The transaction count prior execution of the local state transition, starting /// at 0 for the initial transaction of a block. pub txn_number_before: U256, @@ -131,27 +250,47 @@ pub struct ExtraBlockData { /// The accumulated gas used after execution of the local state transition. 
It should /// match the `block_gas_used` value after execution of the last transaction in a block. pub gas_used_after: U256, - /// The accumulated bloom filter of this block prior execution of the local state transition, - /// starting with all zeros for the initial transaction of a block. - pub block_bloom_before: [U256; 8], - /// The accumulated bloom filter after execution of the local state transition. It should - /// match the `block_bloom` value after execution of the last transaction in a block. - pub block_bloom_after: [U256; 8], +} + +impl ExtraBlockData { + pub fn from_public_inputs(pis: &[F]) -> Self { + assert!(pis.len() == ExtraBlockDataTarget::SIZE); + + let checkpoint_state_trie_root = get_h256(&pis[0..8]); + let txn_number_before = pis[8].to_canonical_u64().into(); + let txn_number_after = pis[9].to_canonical_u64().into(); + let gas_used_before = pis[10].to_canonical_u64().into(); + let gas_used_after = pis[11].to_canonical_u64().into(); + + Self { + checkpoint_state_trie_root, + txn_number_before, + txn_number_after, + gas_used_before, + gas_used_after, + } + } } /// Memory values which are public. /// Note: All the larger integers are encoded with 32-bit limbs in little-endian order. #[derive(Eq, PartialEq, Debug)] pub struct PublicValuesTarget { + /// Trie hashes before the execution of the local state transition. pub trie_roots_before: TrieRootsTarget, + /// Trie hashes after the execution of the local state transition. pub trie_roots_after: TrieRootsTarget, + /// Block metadata: it remains unchanged within a block. pub block_metadata: BlockMetadataTarget, + /// 256 previous block hashes and current block's hash. pub block_hashes: BlockHashesTarget, + /// Extra block data that is specific to the current proof. pub extra_block_data: ExtraBlockDataTarget, } impl PublicValuesTarget { - pub fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { + /// Serializes public value targets. 
+ pub(crate) fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { let TrieRootsTarget { state_root: state_root_before, transactions_root: transactions_root_before, @@ -191,10 +330,10 @@ impl PublicValuesTarget { buffer.write_target(block_number)?; buffer.write_target(block_difficulty)?; buffer.write_target_array(&block_random)?; - buffer.write_target_array(&block_gaslimit)?; + buffer.write_target(block_gaslimit)?; buffer.write_target(block_chain_id)?; buffer.write_target_array(&block_base_fee)?; - buffer.write_target_array(&block_gas_used)?; + buffer.write_target(block_gas_used)?; buffer.write_target_array(&block_blob_base_fee)?; buffer.write_target_array(&block_bloom)?; @@ -206,26 +345,23 @@ impl PublicValuesTarget { buffer.write_target_array(&cur_hash)?; let ExtraBlockDataTarget { - genesis_state_trie_root: genesis_state_root, + checkpoint_state_trie_root, txn_number_before, txn_number_after, gas_used_before, gas_used_after, - block_bloom_before, - block_bloom_after, } = self.extra_block_data; - buffer.write_target_array(&genesis_state_root)?; + buffer.write_target_array(&checkpoint_state_trie_root)?; buffer.write_target(txn_number_before)?; buffer.write_target(txn_number_after)?; - buffer.write_target_array(&gas_used_before)?; - buffer.write_target_array(&gas_used_after)?; - buffer.write_target_array(&block_bloom_before)?; - buffer.write_target_array(&block_bloom_after)?; + buffer.write_target(gas_used_before)?; + buffer.write_target(gas_used_after)?; Ok(()) } - pub fn from_buffer(buffer: &mut Buffer) -> IoResult { + /// Deserializes public value targets. 
+ pub(crate) fn from_buffer(buffer: &mut Buffer) -> IoResult { let trie_roots_before = TrieRootsTarget { state_root: buffer.read_target_array()?, transactions_root: buffer.read_target_array()?, @@ -244,10 +380,10 @@ impl PublicValuesTarget { block_number: buffer.read_target()?, block_difficulty: buffer.read_target()?, block_random: buffer.read_target_array()?, - block_gaslimit: buffer.read_target_array()?, + block_gaslimit: buffer.read_target()?, block_chain_id: buffer.read_target()?, block_base_fee: buffer.read_target_array()?, - block_gas_used: buffer.read_target_array()?, + block_gas_used: buffer.read_target()?, block_blob_base_fee: buffer.read_target_array()?, block_bloom: buffer.read_target_array()?, }; @@ -258,13 +394,11 @@ impl PublicValuesTarget { }; let extra_block_data = ExtraBlockDataTarget { - genesis_state_trie_root: buffer.read_target_array()?, + checkpoint_state_trie_root: buffer.read_target_array()?, txn_number_before: buffer.read_target()?, txn_number_after: buffer.read_target()?, - gas_used_before: buffer.read_target_array()?, - gas_used_after: buffer.read_target_array()?, - block_bloom_before: buffer.read_target_array()?, - block_bloom_after: buffer.read_target_array()?, + gas_used_before: buffer.read_target()?, + gas_used_after: buffer.read_target()?, }; Ok(Self { @@ -276,12 +410,15 @@ impl PublicValuesTarget { }) } - pub fn from_public_inputs(pis: &[Target]) -> Self { + /// Extracts public value `Target`s from the given public input `Target`s. + /// Public values are always the first public inputs added to the circuit, + /// so we can start extracting at index 0. 
+ pub(crate) fn from_public_inputs(pis: &[Target]) -> Self { assert!( pis.len() > TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE - + BlockHashesTarget::BLOCK_HASHES_SIZE + + BlockHashesTarget::SIZE + ExtraBlockDataTarget::SIZE - 1 ); @@ -299,21 +436,20 @@ impl PublicValuesTarget { &pis[TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE ..TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE - + BlockHashesTarget::BLOCK_HASHES_SIZE], + + BlockHashesTarget::SIZE], ), extra_block_data: ExtraBlockDataTarget::from_public_inputs( - &pis[TrieRootsTarget::SIZE * 2 - + BlockMetadataTarget::SIZE - + BlockHashesTarget::BLOCK_HASHES_SIZE + &pis[TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE + BlockHashesTarget::SIZE ..TrieRootsTarget::SIZE * 2 + BlockMetadataTarget::SIZE - + BlockHashesTarget::BLOCK_HASHES_SIZE + + BlockHashesTarget::SIZE + ExtraBlockDataTarget::SIZE], ), } } - pub fn select, const D: usize>( + /// Returns the public values in `pv0` or `pv1` depening on `condition`. + pub(crate) fn select, const D: usize>( builder: &mut CircuitBuilder, condition: BoolTarget, pv0: Self, @@ -354,17 +490,26 @@ impl PublicValuesTarget { } } +/// Circuit version of `TrieRoots`. +/// `Target`s for trie hashes. Since a `Target` holds a 32-bit limb, each hash requires 8 `Target`s. #[derive(Eq, PartialEq, Debug, Copy, Clone)] pub struct TrieRootsTarget { - pub state_root: [Target; 8], - pub transactions_root: [Target; 8], - pub receipts_root: [Target; 8], + /// Targets for the state trie hash. + pub(crate) state_root: [Target; 8], + /// Targets for the transactions trie hash. + pub(crate) transactions_root: [Target; 8], + /// Targets for the receipts trie hash. + pub(crate) receipts_root: [Target; 8], } impl TrieRootsTarget { - pub const SIZE: usize = 24; + /// Number of `Target`s required for all trie hashes. 
+ pub(crate) const HASH_SIZE: usize = 8; + pub(crate) const SIZE: usize = Self::HASH_SIZE * 3; - pub fn from_public_inputs(pis: &[Target]) -> Self { + /// Extracts trie hash `Target`s for all tries from the provided public input `Target`s. + /// The provided `pis` should start with the trie hashes. + pub(crate) fn from_public_inputs(pis: &[Target]) -> Self { let state_root = pis[0..8].try_into().unwrap(); let transactions_root = pis[8..16].try_into().unwrap(); let receipts_root = pis[16..24].try_into().unwrap(); @@ -376,7 +521,9 @@ impl TrieRootsTarget { } } - pub fn select, const D: usize>( + /// If `condition`, returns the trie hashes in `tr0`, + /// otherwise returns the trie hashes in `tr1`. + pub(crate) fn select, const D: usize>( builder: &mut CircuitBuilder, condition: BoolTarget, tr0: Self, @@ -399,7 +546,8 @@ impl TrieRootsTarget { } } - pub fn connect, const D: usize>( + /// Connects the trie hashes in `tr0` and in `tr1`. + pub(crate) fn connect, const D: usize>( builder: &mut CircuitBuilder, tr0: Self, tr1: Self, @@ -412,36 +560,53 @@ impl TrieRootsTarget { } } +/// Circuit version of `BlockMetadata`. +/// Metadata contained in a block header. Those are identical between +/// all state transition proofs within the same block. #[derive(Eq, PartialEq, Debug, Copy, Clone)] pub struct BlockMetadataTarget { - pub block_beneficiary: [Target; 5], - pub block_timestamp: Target, - pub block_number: Target, - pub block_difficulty: Target, - pub block_random: [Target; 8], - pub block_gaslimit: [Target; 2], - pub block_chain_id: Target, - pub block_base_fee: [Target; 2], - pub block_gas_used: [Target; 2], - pub block_blob_base_fee: [Target; 2], - pub block_bloom: [Target; 64], + /// `Target`s for the address of this block's producer. + pub(crate) block_beneficiary: [Target; 5], + /// `Target` for the timestamp of this block. + pub(crate) block_timestamp: Target, + /// `Target` for the index of this block. 
+ pub(crate) block_number: Target, + /// `Target` for the difficulty (before PoS transition) of this block. + pub(crate) block_difficulty: Target, + /// `Target`s for the `mix_hash` value of this block. + pub(crate) block_random: [Target; 8], + /// `Target` for the gas limit of this block. + pub(crate) block_gaslimit: Target, + /// `Target` for the chain id of this block. + pub(crate) block_chain_id: Target, + /// `Target`s for the base fee of this block. + pub(crate) block_base_fee: [Target; 2], + /// `Target` for the gas used of this block. + pub(crate) block_gas_used: Target, + /// `Target`s for the blob base fee of this block. + pub(crate) block_blob_base_fee: [Target; 2], + /// `Target`s for the block bloom of this block. + pub(crate) block_bloom: [Target; 64], } impl BlockMetadataTarget { - pub const SIZE: usize = 89; + /// Number of `Target`s required for the block metadata. + pub(crate) const SIZE: usize = 87; - pub fn from_public_inputs(pis: &[Target]) -> Self { + /// Extracts block metadata `Target`s from the provided public input `Target`s. + /// The provided `pis` should start with the block metadata. 
+ pub(crate) fn from_public_inputs(pis: &[Target]) -> Self { let block_beneficiary = pis[0..5].try_into().unwrap(); let block_timestamp = pis[5]; let block_number = pis[6]; let block_difficulty = pis[7]; let block_random = pis[8..16].try_into().unwrap(); - let block_gaslimit = pis[16..18].try_into().unwrap(); - let block_chain_id = pis[18]; - let block_base_fee = pis[19..21].try_into().unwrap(); - let block_gas_used = pis[21..23].try_into().unwrap(); - let block_blob_base_fee = pis[23..25].try_into().unwrap(); - let block_bloom = pis[25..89].try_into().unwrap(); + let block_gaslimit = pis[16]; + let block_chain_id = pis[17]; + let block_base_fee = pis[18..20].try_into().unwrap(); + let block_gas_used = pis[20]; + let block_blob_base_fee = pis[21..23].try_into().unwrap(); + let block_bloom = pis[23..87].try_into().unwrap(); Self { block_beneficiary, @@ -458,7 +623,9 @@ impl BlockMetadataTarget { } } - pub fn select, const D: usize>( + /// If `condition`, returns the block metadata in `bm0`, + /// otherwise returns the block metadata in `bm1`. 
+ pub(crate) fn select, const D: usize>( builder: &mut CircuitBuilder, condition: BoolTarget, bm0: Self, @@ -478,16 +645,12 @@ impl BlockMetadataTarget { block_random: core::array::from_fn(|i| { builder.select(condition, bm0.block_random[i], bm1.block_random[i]) }), - block_gaslimit: core::array::from_fn(|i| { - builder.select(condition, bm0.block_gaslimit[i], bm1.block_gaslimit[i]) - }), + block_gaslimit: builder.select(condition, bm0.block_gaslimit, bm1.block_gaslimit), block_chain_id: builder.select(condition, bm0.block_chain_id, bm1.block_chain_id), block_base_fee: core::array::from_fn(|i| { builder.select(condition, bm0.block_base_fee[i], bm1.block_base_fee[i]) }), - block_gas_used: core::array::from_fn(|i| { - builder.select(condition, bm0.block_gas_used[i], bm1.block_gas_used[i]) - }), + block_gas_used: builder.select(condition, bm0.block_gas_used, bm1.block_gas_used), block_blob_base_fee: core::array::from_fn(|i| { builder.select( condition, @@ -501,7 +664,8 @@ impl BlockMetadataTarget { } } - pub fn connect, const D: usize>( + /// Connects the block metadata in `bm0` to the block metadata in `bm1`. + pub(crate) fn connect, const D: usize>( builder: &mut CircuitBuilder, bm0: Self, bm1: Self, @@ -515,16 +679,12 @@ impl BlockMetadataTarget { for i in 0..8 { builder.connect(bm0.block_random[i], bm1.block_random[i]); } - for i in 0..2 { - builder.connect(bm0.block_gaslimit[i], bm1.block_gaslimit[i]) - } + builder.connect(bm0.block_gaslimit, bm1.block_gaslimit); builder.connect(bm0.block_chain_id, bm1.block_chain_id); for i in 0..2 { builder.connect(bm0.block_base_fee[i], bm1.block_base_fee[i]) } - for i in 0..2 { - builder.connect(bm0.block_gas_used[i], bm1.block_gas_used[i]) - } + builder.connect(bm0.block_gas_used, bm1.block_gas_used); for i in 0..2 { builder.connect(bm0.block_blob_base_fee[i], bm1.block_blob_base_fee[i]) } @@ -534,22 +694,39 @@ impl BlockMetadataTarget { } } +/// Circuit version of `BlockHashes`. 
+/// `Target`s for the user-provided previous 256 block hashes and current block hash. +/// Each block hash requires 8 `Target`s. +/// The proofs across consecutive blocks ensure that these values +/// are consistent (i.e. shifted by eight `Target`s to the left). +/// +/// When the block number is less than 256, dummy values, i.e. `H256::default()`, +/// should be used for the additional block hashes. #[derive(Eq, PartialEq, Debug, Copy, Clone)] pub struct BlockHashesTarget { - pub prev_hashes: [Target; 2048], - pub cur_hash: [Target; 8], + /// `Target`s for the previous 256 hashes to the current block. The leftmost hash, i.e. `prev_hashes[0..8]`, + /// is the oldest, and the rightmost, i.e. `prev_hashes[255 * 8..256 * 8]`, is the hash of the parent block. + pub(crate) prev_hashes: [Target; 2048], + /// `Target` for the hash of the current block. + pub(crate) cur_hash: [Target; 8], } impl BlockHashesTarget { - pub const BLOCK_HASHES_SIZE: usize = 2056; - pub fn from_public_inputs(pis: &[Target]) -> Self { + /// Number of `Target`s required for previous and current block hashes. + pub(crate) const SIZE: usize = 2056; + + /// Extracts the previous and current block hash `Target`s from the public input `Target`s. + /// The provided `pis` should start with the block hashes. + pub(crate) fn from_public_inputs(pis: &[Target]) -> Self { Self { prev_hashes: pis[0..2048].try_into().unwrap(), cur_hash: pis[2048..2056].try_into().unwrap(), } } - pub fn select, const D: usize>( + /// If `condition`, returns the block hashes in `bm0`, + /// otherwise returns the block hashes in `bm1`. + pub(crate) fn select, const D: usize>( builder: &mut CircuitBuilder, condition: BoolTarget, bm0: Self, @@ -565,7 +742,8 @@ impl BlockHashesTarget { } } - pub fn connect, const D: usize>( + /// Connects the block hashes in `bm0` to the block hashes in `bm1`. 
+ pub(crate) fn connect, const D: usize>( builder: &mut CircuitBuilder, bm0: Self, bm1: Self, ) { @@ -579,52 +757,62 @@ impl BlockHashesTarget { } } +/// Circuit version of `ExtraBlockData`. +/// Additional block data that are specific to the local transaction being proven, +/// unlike `BlockMetadata`. #[derive(Eq, PartialEq, Debug, Copy, Clone)] pub struct ExtraBlockDataTarget { - pub genesis_state_trie_root: [Target; 8], + /// `Target`s for the state trie digest of the checkpoint block. + pub checkpoint_state_trie_root: [Target; 8], + /// `Target` for the transaction count prior execution of the local state transition, starting + /// at 0 for the initial transaction of a block. pub txn_number_before: Target, + /// `Target` for the transaction count after execution of the local state transition. pub txn_number_after: Target, - pub gas_used_before: [Target; 2], - pub gas_used_after: [Target; 2], - pub block_bloom_before: [Target; 64], - pub block_bloom_after: [Target; 64], + /// `Target` for the accumulated gas used prior execution of the local state transition, starting + /// at 0 for the initial transaction of a block. + pub gas_used_before: Target, + /// `Target` for the accumulated gas used after execution of the local state transition. It should + /// match the `block_gas_used` value after execution of the last transaction in a block. + pub gas_used_after: Target, } impl ExtraBlockDataTarget { - const SIZE: usize = 142; + /// Number of `Target`s required for the extra block data. + const SIZE: usize = 12; - pub fn from_public_inputs(pis: &[Target]) -> Self { - let genesis_state_trie_root = pis[0..8].try_into().unwrap(); + /// Extracts the extra block data `Target`s from the public input `Target`s. + /// The provided `pis` should start with the extra block data. 
+ pub(crate) fn from_public_inputs(pis: &[Target]) -> Self { + let checkpoint_state_trie_root = pis[0..8].try_into().unwrap(); let txn_number_before = pis[8]; let txn_number_after = pis[9]; - let gas_used_before = pis[10..12].try_into().unwrap(); - let gas_used_after = pis[12..14].try_into().unwrap(); - let block_bloom_before = pis[14..78].try_into().unwrap(); - let block_bloom_after = pis[78..142].try_into().unwrap(); + let gas_used_before = pis[10]; + let gas_used_after = pis[11]; Self { - genesis_state_trie_root, + checkpoint_state_trie_root, txn_number_before, txn_number_after, gas_used_before, gas_used_after, - block_bloom_before, - block_bloom_after, } } - pub fn select, const D: usize>( + /// If `condition`, returns the extra block data in `ed0`, + /// otherwise returns the extra block data in `ed1`. + pub(crate) fn select, const D: usize>( builder: &mut CircuitBuilder, condition: BoolTarget, ed0: Self, ed1: Self, ) -> Self { Self { - genesis_state_trie_root: core::array::from_fn(|i| { + checkpoint_state_trie_root: core::array::from_fn(|i| { builder.select( condition, - ed0.genesis_state_trie_root[i], - ed1.genesis_state_trie_root[i], + ed0.checkpoint_state_trie_root[i], + ed1.checkpoint_state_trie_root[i], ) }), txn_number_before: builder.select( @@ -633,57 +821,31 @@ impl ExtraBlockDataTarget { ed1.txn_number_before, ), txn_number_after: builder.select(condition, ed0.txn_number_after, ed1.txn_number_after), - gas_used_before: core::array::from_fn(|i| { - builder.select(condition, ed0.gas_used_before[i], ed1.gas_used_before[i]) - }), - gas_used_after: core::array::from_fn(|i| { - builder.select(condition, ed0.gas_used_after[i], ed1.gas_used_after[i]) - }), - block_bloom_before: core::array::from_fn(|i| { - builder.select( - condition, - ed0.block_bloom_before[i], - ed1.block_bloom_before[i], - ) - }), - block_bloom_after: core::array::from_fn(|i| { - builder.select( - condition, - ed0.block_bloom_after[i], - ed1.block_bloom_after[i], - ) - }), + 
gas_used_before: builder.select(condition, ed0.gas_used_before, ed1.gas_used_before), + gas_used_after: builder.select(condition, ed0.gas_used_after, ed1.gas_used_after), } } - pub fn connect, const D: usize>( + /// Connects the extra block data in `ed0` with the extra block data in `ed1`. + pub(crate) fn connect, const D: usize>( builder: &mut CircuitBuilder, ed0: Self, ed1: Self, ) { for i in 0..8 { builder.connect( - ed0.genesis_state_trie_root[i], - ed1.genesis_state_trie_root[i], + ed0.checkpoint_state_trie_root[i], + ed1.checkpoint_state_trie_root[i], ); } builder.connect(ed0.txn_number_before, ed1.txn_number_before); builder.connect(ed0.txn_number_after, ed1.txn_number_after); - for i in 0..2 { - builder.connect(ed0.gas_used_before[i], ed1.gas_used_before[i]); - } - for i in 0..2 { - builder.connect(ed1.gas_used_after[i], ed1.gas_used_after[i]); - } - for i in 0..64 { - builder.connect(ed0.block_bloom_before[i], ed1.block_bloom_before[i]); - } - for i in 0..64 { - builder.connect(ed0.block_bloom_after[i], ed1.block_bloom_after[i]); - } + builder.connect(ed0.gas_used_before, ed1.gas_used_before); + builder.connect(ed0.gas_used_after, ed1.gas_used_after); } } +/// Merkle caps and openings that form the proof of a single STARK. #[derive(Debug, Clone)] pub struct StarkProof, C: GenericConfig, const D: usize> { /// Merkle cap of LDEs of trace values. @@ -706,7 +868,9 @@ where F: RichField + Extendable, C: GenericConfig, { + /// Initial Fiat-Shamir state. pub(crate) init_challenger_state: >::Permutation, + /// Proof for a single STARK. pub(crate) proof: StarkProof, } @@ -721,22 +885,31 @@ impl, C: GenericConfig, const D: usize> S lde_bits - config.fri_config.rate_bits } + /// Returns the number of cross-table lookup polynomials computed for the current STARK. pub fn num_ctl_zs(&self) -> usize { self.openings.ctl_zs_first.len() } } +/// Circuit version of `StarkProof`. +/// Merkle caps and openings that form the proof of a single STARK. 
#[derive(Eq, PartialEq, Debug)] -pub struct StarkProofTarget { +pub(crate) struct StarkProofTarget { + /// `Target` for the Merkle cap if LDEs of trace values. pub trace_cap: MerkleCapTarget, + /// `Target` for the Merkle cap of LDEs of lookup helper and CTL columns. pub auxiliary_polys_cap: MerkleCapTarget, + /// `Target` for the Merkle cap of LDEs of quotient polynomial evaluations. pub quotient_polys_cap: MerkleCapTarget, + /// `Target`s for the purported values of each polynomial at the challenge point. pub openings: StarkOpeningSetTarget, + /// `Target`s for the batch FRI argument for all openings. pub opening_proof: FriProofTarget, } impl StarkProofTarget { - pub fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { + /// Serializes a STARK proof. + pub(crate) fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { buffer.write_target_merkle_cap(&self.trace_cap)?; buffer.write_target_merkle_cap(&self.auxiliary_polys_cap)?; buffer.write_target_merkle_cap(&self.quotient_polys_cap)?; @@ -745,7 +918,8 @@ impl StarkProofTarget { Ok(()) } - pub fn from_buffer(buffer: &mut Buffer) -> IoResult { + /// Deserializes a STARK proof. + pub(crate) fn from_buffer(buffer: &mut Buffer) -> IoResult { let trace_cap = buffer.read_target_merkle_cap()?; let auxiliary_polys_cap = buffer.read_target_merkle_cap()?; let quotient_polys_cap = buffer.read_target_merkle_cap()?; @@ -762,7 +936,7 @@ impl StarkProofTarget { } /// Recover the length of the trace from a STARK proof and a STARK config. - pub fn recover_degree_bits(&self, config: &StarkConfig) -> usize { + pub(crate) fn recover_degree_bits(&self, config: &StarkConfig) -> usize { let initial_merkle_proof = &self.opening_proof.query_round_proofs[0] .initial_trees_proof .evals_proofs[0] @@ -772,6 +946,7 @@ impl StarkProofTarget { } } +/// Randomness used for a STARK proof. pub(crate) struct StarkProofChallenges, const D: usize> { /// Random values used to combine STARK constraints. 
pub stark_alphas: Vec, @@ -779,12 +954,17 @@ pub(crate) struct StarkProofChallenges, const D: us /// Point at which the STARK polynomials are opened. pub stark_zeta: F::Extension, + /// Randomness used in FRI. pub fri_challenges: FriChallenges, } +/// Circuit version of `StarkProofChallenges`. pub(crate) struct StarkProofChallengesTarget { + /// `Target`s for the random values used to combine STARK constraints. pub stark_alphas: Vec, + /// `ExtensionTarget` for the point at which the STARK polynomials are opened. pub stark_zeta: ExtensionTarget, + /// `Target`s for the randomness used in FRI. pub fri_challenges: FriChallengesTarget, } @@ -806,6 +986,9 @@ pub struct StarkOpeningSet, const D: usize> { } impl, const D: usize> StarkOpeningSet { + /// Returns a `StarkOpeningSet` given all the polynomial commitments, the number of permutation `Z`polynomials, + /// the evaluation point and a generator `g`. + /// Polynomials are evaluated at point `zeta` and, if necessary, at `g * zeta`. pub fn new>( zeta: F::Extension, g: F, @@ -813,32 +996,41 @@ impl, const D: usize> StarkOpeningSet { auxiliary_polys_commitment: &PolynomialBatch, quotient_commitment: &PolynomialBatch, num_lookup_columns: usize, + num_ctl_polys: &[usize], ) -> Self { + let total_num_helper_cols: usize = num_ctl_polys.iter().sum(); + + // Batch evaluates polynomials on the LDE, at a point `z`. let eval_commitment = |z: F::Extension, c: &PolynomialBatch| { c.polynomials .par_iter() .map(|p| p.to_extension().eval(z)) .collect::>() }; + // Batch evaluates polynomials at a base field point `z`. let eval_commitment_base = |z: F, c: &PolynomialBatch| { c.polynomials .par_iter() .map(|p| p.eval(z)) .collect::>() }; + + let auxiliary_first = eval_commitment_base(F::ONE, auxiliary_polys_commitment); + let ctl_zs_first = auxiliary_first[num_lookup_columns + total_num_helper_cols..].to_vec(); + // `g * zeta`. 
let zeta_next = zeta.scalar_mul(g); Self { local_values: eval_commitment(zeta, trace_commitment), next_values: eval_commitment(zeta_next, trace_commitment), auxiliary_polys: eval_commitment(zeta, auxiliary_polys_commitment), auxiliary_polys_next: eval_commitment(zeta_next, auxiliary_polys_commitment), - ctl_zs_first: eval_commitment_base(F::ONE, auxiliary_polys_commitment) - [num_lookup_columns..] - .to_vec(), + ctl_zs_first, quotient_polys: eval_commitment(zeta, quotient_commitment), } } + /// Constructs the openings required by FRI. + /// All openings but `ctl_zs_first` are grouped together. pub(crate) fn to_fri_openings(&self) -> FriOpenings { let zeta_batch = FriOpeningBatch { values: self @@ -873,18 +1065,27 @@ impl, const D: usize> StarkOpeningSet { } } +/// Circuit version of `StarkOpeningSet`. +/// `Target`s for the purported values of each polynomial at the challenge point. #[derive(Eq, PartialEq, Debug)] -pub struct StarkOpeningSetTarget { +pub(crate) struct StarkOpeningSetTarget { + /// `ExtensionTarget`s for the openings of trace polynomials at `zeta`. pub local_values: Vec>, + /// `ExtensionTarget`s for the opening of trace polynomials at `g * zeta`. pub next_values: Vec>, + /// `ExtensionTarget`s for the opening of lookups and cross-table lookups `Z` polynomials at `zeta`. pub auxiliary_polys: Vec>, + /// `ExtensionTarget`s for the opening of lookups and cross-table lookups `Z` polynomials at `g * zeta`. pub auxiliary_polys_next: Vec>, + /// `ExtensionTarget`s for the opening of lookups and cross-table lookups `Z` polynomials at 1. pub ctl_zs_first: Vec, + /// `ExtensionTarget`s for the opening of quotient polynomials at `zeta`. pub quotient_polys: Vec>, } impl StarkOpeningSetTarget { - pub fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { + /// Serializes a STARK's opening set. 
+ pub(crate) fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { buffer.write_target_ext_vec(&self.local_values)?; buffer.write_target_ext_vec(&self.next_values)?; buffer.write_target_ext_vec(&self.auxiliary_polys)?; @@ -894,7 +1095,8 @@ impl StarkOpeningSetTarget { Ok(()) } - pub fn from_buffer(buffer: &mut Buffer) -> IoResult { + /// Deserializes a STARK's opening set. + pub(crate) fn from_buffer(buffer: &mut Buffer) -> IoResult { let local_values = buffer.read_target_ext_vec::()?; let next_values = buffer.read_target_ext_vec::()?; let auxiliary_polys = buffer.read_target_ext_vec::()?; @@ -912,6 +1114,9 @@ impl StarkOpeningSetTarget { }) } + /// Circuit version of `to_fri_openings`for `FriOpenings`. + /// Constructs the `Target`s the circuit version of FRI. + /// All openings but `ctl_zs_first` are grouped together. pub(crate) fn to_fri_openings(&self, zero: Target) -> FriOpeningsTarget { let zeta_batch = FriOpeningBatchTarget { values: self diff --git a/evm/src/prover.rs b/evm/src/prover.rs index c5729a573f..f376b8cd28 100644 --- a/evm/src/prover.rs +++ b/evm/src/prover.rs @@ -1,4 +1,7 @@ -use anyhow::{ensure, Result}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use anyhow::{anyhow, ensure, Result}; use itertools::Itertools; use once_cell::sync::Lazy; use plonky2::field::extension::Extendable; @@ -26,7 +29,6 @@ use crate::cross_table_lookup::{ GrandProductChallengeSet, }; use crate::evaluation_frame::StarkEvaluationFrame; -use crate::generation::outputs::GenerationOutputs; use crate::generation::{generate_traces, GenerationInputs}; use crate::get_challenges::observe_public_values; use crate::lookup::{lookup_helper_columns, Lookup, LookupCheckVars}; @@ -44,35 +46,29 @@ pub fn prove( config: &StarkConfig, inputs: GenerationInputs, timing: &mut TimingTree, + abort_signal: Option>, ) -> Result> -where - F: RichField + Extendable, - C: GenericConfig, -{ - let (proof, _outputs) = prove_with_outputs(all_stark, config, inputs, timing)?; - 
Ok(proof) -} - -/// Generate traces, then create all STARK proofs. Returns information about the post-state, -/// intended for debugging, in addition to the proof. -pub fn prove_with_outputs( - all_stark: &AllStark, - config: &StarkConfig, - inputs: GenerationInputs, - timing: &mut TimingTree, -) -> Result<(AllProof, GenerationOutputs)> where F: RichField + Extendable, C: GenericConfig, { timed!(timing, "build kernel", Lazy::force(&KERNEL)); - let (traces, public_values, outputs) = timed!( + let (traces, public_values) = timed!( timing, "generate all traces", generate_traces(all_stark, inputs, config, timing)? ); - let proof = prove_with_traces(all_stark, config, traces, public_values, timing)?; - Ok((proof, outputs)) + check_abort_signal(abort_signal.clone())?; + + let proof = prove_with_traces( + all_stark, + config, + traces, + public_values, + timing, + abort_signal, + )?; + Ok(proof) } /// Compute all STARK proofs. @@ -82,6 +78,7 @@ pub(crate) fn prove_with_traces( trace_poly_values: [Vec>; NUM_TABLES], public_values: PublicValues, timing: &mut TimingTree, + abort_signal: Option>, ) -> Result> where F: RichField + Extendable, @@ -90,6 +87,7 @@ where let rate_bits = config.fri_config.rate_bits; let cap_height = config.fri_config.cap_height; + // For each STARK, we compute the polynomial commitments for the polynomials interpolating its trace. let trace_commitments = timed!( timing, "compute all trace commitments", @@ -101,8 +99,6 @@ where timing, &format!("compute trace commitment for {:?}", table), PolynomialBatch::::from_values( - // TODO: Cloning this isn't great; consider having `from_values` accept a reference, - // or having `compute_permutation_z_polys` read trace values from the `PolynomialBatch`. trace.clone(), rate_bits, false, @@ -115,6 +111,7 @@ where .collect::>() ); + // Get the Merkle caps for all trace commitments and observe them. 
let trace_caps = trace_commitments .iter() .map(|c| c.merkle_tree.cap.clone()) @@ -127,14 +124,17 @@ where observe_public_values::(&mut challenger, &public_values) .map_err(|_| anyhow::Error::msg("Invalid conversion of public values."))?; + // Get challenges for the cross-table lookups. let ctl_challenges = get_grand_product_challenge_set(&mut challenger, config.num_challenges); + // For each STARK, compute its cross-table lookup Z polynomials and get the associated `CtlData`. let ctl_data_per_table = timed!( timing, "compute CTL data", - cross_table_lookup_data::( + cross_table_lookup_data::( &trace_poly_values, &all_stark.cross_table_lookups, &ctl_challenges, + all_stark.arithmetic_stark.constraint_degree() ) ); @@ -149,7 +149,8 @@ where ctl_data_per_table, &mut challenger, &ctl_challenges, - timing + timing, + abort_signal, )? ); @@ -169,6 +170,13 @@ where }) } +/// Generates a proof for each STARK. +/// At this stage, we have computed the trace polynomials commitments for the various STARKs, +/// and we have the cross-table lookup data for each table, including the associated challenges. +/// - `trace_poly_values` are the trace values for each STARK. +/// - `trace_commitments` are the trace polynomials commitments for each STARK. +/// - `ctl_data_per_table` group all the cross-table lookup data for each STARK. +/// Each STARK uses its associated data to generate a proof. fn prove_with_commitments( all_stark: &AllStark, config: &StarkConfig, @@ -178,6 +186,7 @@ fn prove_with_commitments( challenger: &mut Challenger, ctl_challenges: &GrandProductChallengeSet, timing: &mut TimingTree, + abort_signal: Option>, ) -> Result<[StarkProofWithMetadata; NUM_TABLES]> where F: RichField + Extendable, @@ -195,6 +204,7 @@ where ctl_challenges, challenger, timing, + abort_signal.clone(), )? ); let byte_packing_proof = timed!( @@ -209,6 +219,7 @@ where ctl_challenges, challenger, timing, + abort_signal.clone(), )? 
); let cpu_proof = timed!( @@ -223,6 +234,7 @@ where ctl_challenges, challenger, timing, + abort_signal.clone(), )? ); let keccak_proof = timed!( @@ -237,6 +249,7 @@ where ctl_challenges, challenger, timing, + abort_signal.clone(), )? ); let keccak_sponge_proof = timed!( @@ -251,6 +264,7 @@ where ctl_challenges, challenger, timing, + abort_signal.clone(), )? ); let logic_proof = timed!( @@ -265,6 +279,7 @@ where ctl_challenges, challenger, timing, + abort_signal.clone(), )? ); let memory_proof = timed!( @@ -279,6 +294,7 @@ where ctl_challenges, challenger, timing, + abort_signal, )? ); @@ -293,7 +309,10 @@ where ]) } -/// Compute proof for a single STARK table. +/// Computes a proof for a single STARK table, including: +/// - the initial state of the challenger, +/// - all the requires Merkle caps, +/// - all the required polynomial and FRI argument openings. pub(crate) fn prove_single_table( stark: &S, config: &StarkConfig, @@ -303,12 +322,15 @@ pub(crate) fn prove_single_table( ctl_challenges: &GrandProductChallengeSet, challenger: &mut Challenger, timing: &mut TimingTree, + abort_signal: Option>, ) -> Result> where F: RichField + Extendable, C: GenericConfig, S: Stark, { + check_abort_signal(abort_signal.clone())?; + let degree = trace_poly_values[0].len(); let degree_bits = log2_strict(degree); let fri_params = config.fri_params(degree_bits); @@ -350,15 +372,23 @@ where ); let num_lookup_columns = lookup_helper_columns.as_ref().map(|v| v.len()).unwrap_or(0); + // We add CTLs to the permutation arguments so that we can batch commit to + // all auxiliary polynomials. 
let auxiliary_polys = match lookup_helper_columns { - None => ctl_data.z_polys(), + None => { + let mut ctl_polys = ctl_data.ctl_helper_polys(); + ctl_polys.extend(ctl_data.ctl_z_polys()); + ctl_polys + } Some(mut lookup_columns) => { - lookup_columns.extend(ctl_data.z_polys()); + lookup_columns.extend(ctl_data.ctl_helper_polys()); + lookup_columns.extend(ctl_data.ctl_z_polys()); lookup_columns } }; assert!(!auxiliary_polys.is_empty(), "No CTL?"); + // Get the polynomial commitments for all auxiliary polynomials. let auxiliary_polys_commitment = timed!( timing, "compute auxiliary polynomials commitment", @@ -377,6 +407,8 @@ where let alphas = challenger.get_n_challenges(config.num_challenges); + let num_ctl_polys = ctl_data.num_ctl_helper_polys(); + #[cfg(test)] { check_constraints( @@ -389,9 +421,12 @@ where alphas.clone(), degree_bits, num_lookup_columns, + &num_ctl_polys, ); } + check_abort_signal(abort_signal.clone())?; + let quotient_polys = timed!( timing, "compute quotient polys", @@ -405,6 +440,7 @@ where alphas, degree_bits, num_lookup_columns, + &num_ctl_polys, config, ) ); @@ -424,6 +460,7 @@ where }) .collect() ); + // Commit to the quotient polynomials. let quotient_commitment = timed!( timing, "compute quotient commitment", @@ -436,6 +473,7 @@ where None, ) ); + // Observe the quotient polynomials Merkle cap. let quotient_polys_cap = quotient_commitment.merkle_tree.cap.clone(); challenger.observe_cap("ient_polys_cap); @@ -449,6 +487,7 @@ where "Opening point is in the subgroup." ); + // Compute all openings: evaluate all committed polynomials at `zeta` and, when necessary, at `g * zeta`. let openings = StarkOpeningSet::new( zeta, g, @@ -456,7 +495,9 @@ where &auxiliary_polys_commitment, "ient_commitment, stark.num_lookup_helper_columns(config), + &num_ctl_polys, ); + // Get the FRI openings and observe them. 
challenger.observe_openings(&openings.to_fri_openings()); let initial_merkle_trees = vec![ @@ -465,11 +506,13 @@ where "ient_commitment, ]; + check_abort_signal(abort_signal.clone())?; + let opening_proof = timed!( timing, "compute openings proof", PolynomialBatch::prove_openings( - &stark.fri_instance(zeta, g, ctl_data.len(), config), + &stark.fri_instance(zeta, g, num_ctl_polys.iter().sum(), num_ctl_polys, config), &initial_merkle_trees, challenger, &fri_params, @@ -497,11 +540,12 @@ fn compute_quotient_polys<'a, F, P, C, S, const D: usize>( trace_commitment: &'a PolynomialBatch, auxiliary_polys_commitment: &'a PolynomialBatch, lookup_challenges: Option<&'a Vec>, - lookups: &[Lookup], + lookups: &[Lookup], ctl_data: &CtlData, alphas: Vec, degree_bits: usize, num_lookup_columns: usize, + num_ctl_columns: &[usize], config: &StarkConfig, ) -> Vec> where @@ -512,6 +556,7 @@ where { let degree = 1 << degree_bits; let rate_bits = config.fri_config.rate_bits; + let total_num_helper_cols: usize = num_ctl_columns.iter().sum(); let quotient_degree_bits = log2_ceil(stark.quotient_degree_factor()); assert!( @@ -563,31 +608,62 @@ where lagrange_basis_first, lagrange_basis_last, ); + // Get the local and next row evaluations for the current STARK. let vars = S::EvaluationFrame::from_values( &get_trace_values_packed(i_start), &get_trace_values_packed(i_next_start), ); + // Get the local and next row evaluations for the permutation argument, as well as the associated challenges. 
let lookup_vars = lookup_challenges.map(|challenges| LookupCheckVars { local_values: auxiliary_polys_commitment.get_lde_values_packed(i_start, step) [..num_lookup_columns] .to_vec(), - next_values: auxiliary_polys_commitment.get_lde_values_packed(i_next_start, step), + next_values: auxiliary_polys_commitment.get_lde_values_packed(i_next_start, step) + [..num_lookup_columns] + .to_vec(), challenges: challenges.to_vec(), }); + + // Get all the data for this STARK's CTLs: + // - the local and next row evaluations for the CTL Z polynomials + // - the associated challenges. + // - for each CTL: + // - the filter `Column` + // - the `Column`s that form the looking/looked table. + + let mut start_index = 0; let ctl_vars = ctl_data .zs_columns .iter() .enumerate() - .map(|(i, zs_columns)| CtlCheckVars:: { - local_z: auxiliary_polys_commitment.get_lde_values_packed(i_start, step) - [num_lookup_columns + i], - next_z: auxiliary_polys_commitment.get_lde_values_packed(i_next_start, step) - [num_lookup_columns + i], - challenges: zs_columns.challenge, - columns: &zs_columns.columns, - filter_column: &zs_columns.filter_column, + .map(|(i, zs_columns)| { + let num_ctl_helper_cols = num_ctl_columns[i]; + let helper_columns = auxiliary_polys_commitment + .get_lde_values_packed(i_start, step)[num_lookup_columns + + start_index + ..num_lookup_columns + start_index + num_ctl_helper_cols] + .to_vec(); + + let ctl_vars = CtlCheckVars:: { + helper_columns, + local_z: auxiliary_polys_commitment.get_lde_values_packed(i_start, step) + [num_lookup_columns + total_num_helper_cols + i], + next_z: auxiliary_polys_commitment + .get_lde_values_packed(i_next_start, step) + [num_lookup_columns + total_num_helper_cols + i], + challenges: zs_columns.challenge, + columns: zs_columns.columns.clone(), + filter: zs_columns.filter.clone(), + }; + + start_index += num_ctl_helper_cols; + + ctl_vars }) .collect::>(); + + // Evaluate the polynomial combining all constraints, including those associated + // to 
the permutation and CTL arguments. eval_vanishing_poly::( stark, &vars, @@ -620,6 +696,19 @@ where .collect() } +/// Utility method that checks whether a kill signal has been emitted by one of the workers, +/// which will result in an early abort for all the other processes involved in the same set +/// of transactions. +pub fn check_abort_signal(abort_signal: Option>) -> Result<()> { + if let Some(signal) = abort_signal { + if signal.load(Ordering::Relaxed) { + return Err(anyhow!("Stopping job from abort signal.")); + } + } + + Ok(()) +} + #[cfg(test)] /// Check that all constraints evaluate to zero on `H`. /// Can also be used to check the degree of the constraints by evaluating on a larger subgroup. @@ -628,11 +717,12 @@ fn check_constraints<'a, F, C, S, const D: usize>( trace_commitment: &'a PolynomialBatch, auxiliary_commitment: &'a PolynomialBatch, lookup_challenges: Option<&'a Vec>, - lookups: &[Lookup], + lookups: &[Lookup], ctl_data: &CtlData, alphas: Vec, degree_bits: usize, num_lookup_columns: usize, + num_ctl_helper_cols: &[usize], ) where F: RichField + Extendable, C: GenericConfig, @@ -641,6 +731,8 @@ fn check_constraints<'a, F, C, S, const D: usize>( let degree = 1 << degree_bits; let rate_bits = 0; // Set this to higher value to check constraint degree. + let total_num_helper_cols: usize = num_ctl_helper_cols.iter().sum(); + let size = degree << rate_bits; let step = 1 << rate_bits; @@ -661,6 +753,7 @@ fn check_constraints<'a, F, C, S, const D: usize>( transpose(&values) }; + // Get batch evaluations of the trace, permutation and CTL polynomials over our subgroup. let trace_subgroup_evals = get_subgroup_evals(trace_commitment); let auxiliary_subgroup_evals = get_subgroup_evals(auxiliary_commitment); @@ -682,28 +775,49 @@ fn check_constraints<'a, F, C, S, const D: usize>( lagrange_basis_first, lagrange_basis_last, ); + // Get the local and next row evaluations for the current STARK's trace. 
let vars = S::EvaluationFrame::from_values( &trace_subgroup_evals[i], &trace_subgroup_evals[i_next], ); + // Get the local and next row evaluations for the current STARK's permutation argument. let lookup_vars = lookup_challenges.map(|challenges| LookupCheckVars { local_values: auxiliary_subgroup_evals[i][..num_lookup_columns].to_vec(), next_values: auxiliary_subgroup_evals[i_next][..num_lookup_columns].to_vec(), challenges: challenges.to_vec(), }); + // Get the local and next row evaluations for the current STARK's CTL Z polynomials. + let mut start_index = 0; let ctl_vars = ctl_data .zs_columns .iter() .enumerate() - .map(|(iii, zs_columns)| CtlCheckVars:: { - local_z: auxiliary_subgroup_evals[i][num_lookup_columns + iii], - next_z: auxiliary_subgroup_evals[i_next][num_lookup_columns + iii], - challenges: zs_columns.challenge, - columns: &zs_columns.columns, - filter_column: &zs_columns.filter_column, + .map(|(iii, zs_columns)| { + let num_helper_cols = num_ctl_helper_cols[iii]; + let helper_columns = auxiliary_subgroup_evals[i][num_lookup_columns + + start_index + ..num_lookup_columns + start_index + num_helper_cols] + .to_vec(); + let ctl_vars = CtlCheckVars:: { + helper_columns, + local_z: auxiliary_subgroup_evals[i] + [num_lookup_columns + total_num_helper_cols + iii], + next_z: auxiliary_subgroup_evals[i_next] + [num_lookup_columns + total_num_helper_cols + iii], + challenges: zs_columns.challenge, + columns: zs_columns.columns.clone(), + filter: zs_columns.filter.clone(), + }; + + start_index += num_helper_cols; + + ctl_vars }) .collect::>(); + + // Evaluate the polynomial combining all constraints, including those associated + // to the permutation and CTL arguments. eval_vanishing_poly::( stark, &vars, @@ -716,6 +830,7 @@ fn check_constraints<'a, F, C, S, const D: usize>( }) .collect::>(); + // Assert that all constraints evaluate to 0 over our subgroup. 
for v in constraint_values { assert!( v.iter().all(|x| x.is_zero()), diff --git a/evm/src/recursive_verifier.rs b/evm/src/recursive_verifier.rs index 9b294fc5ae..8053cbee58 100644 --- a/evm/src/recursive_verifier.rs +++ b/evm/src/recursive_verifier.rs @@ -1,4 +1,5 @@ -use std::fmt::Debug; +use core::array::from_fn; +use core::fmt::Debug; use anyhow::Result; use ethereum_types::{BigEndianHash, U256}; @@ -28,12 +29,11 @@ use plonky2_util::log2_ceil; use crate::all_stark::Table; use crate::config::StarkConfig; use crate::constraint_consumer::RecursiveConstraintConsumer; +use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; -use crate::cross_table_lookup::{ - CrossTableLookup, CtlCheckVarsTarget, GrandProductChallenge, GrandProductChallengeSet, -}; +use crate::cross_table_lookup::{CrossTableLookup, CtlCheckVarsTarget, GrandProductChallengeSet}; use crate::evaluation_frame::StarkEvaluationFrame; -use crate::lookup::LookupCheckVarsTarget; +use crate::lookup::{GrandProductChallenge, LookupCheckVarsTarget}; use crate::memory::segments::Segment; use crate::memory::VALUE_LIMBS; use crate::proof::{ @@ -110,7 +110,7 @@ where C: GenericConfig, C::Hasher: AlgebraicHasher, { - pub fn to_buffer( + pub(crate) fn to_buffer( &self, buffer: &mut Vec, gate_serializer: &dyn GateSerializer, @@ -124,7 +124,7 @@ where Ok(()) } - pub fn from_buffer( + pub(crate) fn from_buffer( buffer: &mut Buffer, gate_serializer: &dyn GateSerializer, generator_serializer: &dyn WitnessGeneratorSerializer, @@ -227,10 +227,24 @@ where let zero_target = builder.zero(); let num_lookup_columns = stark.num_lookup_helper_columns(inner_config); - let num_ctl_zs = - CrossTableLookup::num_ctl_zs(cross_table_lookups, table, inner_config.num_challenges); - let proof_target = - add_virtual_stark_proof(&mut builder, stark, inner_config, degree_bits, num_ctl_zs); + let (total_num_helpers, num_ctl_zs, num_helpers_by_ctl) = + 
CrossTableLookup::num_ctl_helpers_zs_all( + cross_table_lookups, + *table, + inner_config.num_challenges, + stark.constraint_degree(), + ); + let num_ctl_helper_zs = num_ctl_zs + total_num_helpers; + + let proof_target = add_virtual_stark_proof( + &mut builder, + stark, + inner_config, + degree_bits, + num_ctl_helper_zs, + num_ctl_zs, + ); + builder.register_public_inputs( &proof_target .trace_cap @@ -250,11 +264,13 @@ where }; let ctl_vars = CtlCheckVarsTarget::from_proof( - table, + *table, &proof_target, cross_table_lookups, &ctl_challenges_target, num_lookup_columns, + total_num_helpers, + &num_helpers_by_ctl, ); let init_challenger_state_target = @@ -328,6 +344,11 @@ fn verify_stark_proof_with_challenges_circuit< let zero = builder.zero(); let one = builder.one_extension(); + let num_ctl_polys = ctl_vars + .iter() + .map(|ctl| ctl.helper_columns.len()) + .sum::(); + let StarkOpeningSetTarget { local_values, next_values, @@ -405,6 +426,7 @@ fn verify_stark_proof_with_challenges_circuit< builder, challenges.stark_zeta, F::primitive_root_of_unity(degree_bits), + num_ctl_polys, ctl_zs_first.len(), inner_config, ); @@ -418,118 +440,116 @@ fn verify_stark_proof_with_challenges_circuit< ); } -/// Recursive version of `get_memory_extra_looking_products`. -pub(crate) fn get_memory_extra_looking_products_circuit< - F: RichField + Extendable, - const D: usize, ->( +/// Recursive version of `get_memory_extra_looking_sum`. +pub(crate) fn get_memory_extra_looking_sum_circuit, const D: usize>( builder: &mut CircuitBuilder, public_values: &PublicValuesTarget, challenge: GrandProductChallenge, ) -> Target { - let mut product = builder.one(); + let mut sum = builder.zero(); // Add metadata writes. 
let block_fields_scalars = [ ( - GlobalMetadata::BlockTimestamp as usize, + GlobalMetadata::BlockTimestamp, public_values.block_metadata.block_timestamp, ), ( - GlobalMetadata::BlockNumber as usize, + GlobalMetadata::BlockNumber, public_values.block_metadata.block_number, ), ( - GlobalMetadata::BlockDifficulty as usize, + GlobalMetadata::BlockDifficulty, public_values.block_metadata.block_difficulty, ), ( - GlobalMetadata::BlockChainId as usize, + GlobalMetadata::BlockGasLimit, + public_values.block_metadata.block_gaslimit, + ), + ( + GlobalMetadata::BlockChainId, public_values.block_metadata.block_chain_id, ), ( - GlobalMetadata::TxnNumberBefore as usize, + GlobalMetadata::BlockGasUsed, + public_values.block_metadata.block_gas_used, + ), + ( + GlobalMetadata::BlockGasUsedBefore, + public_values.extra_block_data.gas_used_before, + ), + ( + GlobalMetadata::BlockGasUsedAfter, + public_values.extra_block_data.gas_used_after, + ), + ( + GlobalMetadata::TxnNumberBefore, public_values.extra_block_data.txn_number_before, ), ( - GlobalMetadata::TxnNumberAfter as usize, + GlobalMetadata::TxnNumberAfter, public_values.extra_block_data.txn_number_after, ), ]; // This contains the `block_beneficiary`, `block_random`, `block_base_fee`, - // `block_gaslimit`, `block_gas_used`, `block_blob_base_fee` as well as `cur_hash`, - // `gas_used_before` and `gas_used_after`. - let block_fields_arrays: [(usize, &[Target]); 9] = [ + // `block_blob_base_fee` as well as `cur_hash`. 
+ let block_fields_arrays: [(GlobalMetadata, &[Target]); 5] = [ ( - GlobalMetadata::BlockBeneficiary as usize, + GlobalMetadata::BlockBeneficiary, &public_values.block_metadata.block_beneficiary, ), ( - GlobalMetadata::BlockRandom as usize, + GlobalMetadata::BlockRandom, &public_values.block_metadata.block_random, ), ( - GlobalMetadata::BlockBaseFee as usize, + GlobalMetadata::BlockBaseFee, &public_values.block_metadata.block_base_fee, ), ( - GlobalMetadata::BlockGasLimit as usize, - &public_values.block_metadata.block_gaslimit, - ), - ( - GlobalMetadata::BlockGasUsed as usize, - &public_values.block_metadata.block_gas_used, - ), - ( - GlobalMetadata::BlockBlobBaseFee as usize, + GlobalMetadata::BlockBlobBaseFee, &public_values.block_metadata.block_blob_base_fee, ), ( - GlobalMetadata::BlockCurrentHash as usize, + GlobalMetadata::BlockCurrentHash, &public_values.block_hashes.cur_hash, ), - ( - GlobalMetadata::BlockGasUsedBefore as usize, - &public_values.extra_block_data.gas_used_before, - ), - ( - GlobalMetadata::BlockGasUsedAfter as usize, - &public_values.extra_block_data.gas_used_after, - ), ]; - let metadata_segment = builder.constant(F::from_canonical_u32(Segment::GlobalMetadata as u32)); + let metadata_segment = + builder.constant(F::from_canonical_usize(Segment::GlobalMetadata.unscale())); block_fields_scalars.map(|(field, target)| { // Each of those fields fit in 32 bits, hence in a single Target. - product = add_data_write( + sum = add_data_write( builder, challenge, - product, + sum, metadata_segment, - field, + field.unscale(), &[target], ); }); block_fields_arrays.map(|(field, targets)| { - product = add_data_write( + sum = add_data_write( builder, challenge, - product, + sum, metadata_segment, - field, + field.unscale(), targets, ); }); // Add block hashes writes. 
- let block_hashes_segment = builder.constant(F::from_canonical_u32(Segment::BlockHashes as u32)); + let block_hashes_segment = + builder.constant(F::from_canonical_usize(Segment::BlockHashes.unscale())); for i in 0..256 { - product = add_data_write( + sum = add_data_write( builder, challenge, - product, + sum, block_hashes_segment, i, &public_values.block_hashes.prev_hashes[8 * i..8 * (i + 1)], @@ -537,85 +557,86 @@ pub(crate) fn get_memory_extra_looking_products_circuit< } // Add block bloom filters writes. - let bloom_segment = builder.constant(F::from_canonical_u32(Segment::GlobalBlockBloom as u32)); + let bloom_segment = + builder.constant(F::from_canonical_usize(Segment::GlobalBlockBloom.unscale())); for i in 0..8 { - product = add_data_write( + sum = add_data_write( builder, challenge, - product, + sum, bloom_segment, i, &public_values.block_metadata.block_bloom[i * 8..(i + 1) * 8], ); } - for i in 0..8 { - product = add_data_write( - builder, - challenge, - product, - bloom_segment, - i + 8, - &public_values.extra_block_data.block_bloom_before[i * 8..(i + 1) * 8], - ); - } - - for i in 0..8 { - product = add_data_write( - builder, - challenge, - product, - bloom_segment, - i + 16, - &public_values.extra_block_data.block_bloom_after[i * 8..(i + 1) * 8], - ); - } // Add trie roots writes. 
let trie_fields = [ ( - GlobalMetadata::StateTrieRootDigestBefore as usize, + GlobalMetadata::StateTrieRootDigestBefore, public_values.trie_roots_before.state_root, ), ( - GlobalMetadata::TransactionTrieRootDigestBefore as usize, + GlobalMetadata::TransactionTrieRootDigestBefore, public_values.trie_roots_before.transactions_root, ), ( - GlobalMetadata::ReceiptTrieRootDigestBefore as usize, + GlobalMetadata::ReceiptTrieRootDigestBefore, public_values.trie_roots_before.receipts_root, ), ( - GlobalMetadata::StateTrieRootDigestAfter as usize, + GlobalMetadata::StateTrieRootDigestAfter, public_values.trie_roots_after.state_root, ), ( - GlobalMetadata::TransactionTrieRootDigestAfter as usize, + GlobalMetadata::TransactionTrieRootDigestAfter, public_values.trie_roots_after.transactions_root, ), ( - GlobalMetadata::ReceiptTrieRootDigestAfter as usize, + GlobalMetadata::ReceiptTrieRootDigestAfter, public_values.trie_roots_after.receipts_root, ), ]; trie_fields.map(|(field, targets)| { - product = add_data_write( + sum = add_data_write( builder, challenge, - product, + sum, metadata_segment, - field, + field.unscale(), &targets, ); }); - product + // Add kernel hash and kernel length. 
+ let kernel_hash_limbs = h256_limbs::(KERNEL.code_hash); + let kernel_hash_targets: [Target; 8] = from_fn(|i| builder.constant(kernel_hash_limbs[i])); + sum = add_data_write( + builder, + challenge, + sum, + metadata_segment, + GlobalMetadata::KernelHash.unscale(), + &kernel_hash_targets, + ); + let kernel_len_target = builder.constant(F::from_canonical_usize(KERNEL.code.len())); + sum = add_data_write( + builder, + challenge, + sum, + metadata_segment, + GlobalMetadata::KernelLen.unscale(), + &[kernel_len_target], + ); + + sum } fn add_data_write, const D: usize>( builder: &mut CircuitBuilder, challenge: GrandProductChallenge, - running_product: Target, + running_sum: Target, segment: Target, idx: usize, val: &[Target], @@ -648,7 +669,8 @@ fn add_data_write, const D: usize>( builder.assert_one(row[12]); let combined = challenge.combine_base_circuit(builder, &row); - builder.mul(running_product, combined) + let inverse = builder.inverse(combined); + builder.add(running_sum, inverse) } fn eval_l_0_and_l_last_circuit, const D: usize>( @@ -708,10 +730,10 @@ pub(crate) fn add_virtual_block_metadata, const D: let block_number = builder.add_virtual_public_input(); let block_difficulty = builder.add_virtual_public_input(); let block_random = builder.add_virtual_public_input_arr(); - let block_gaslimit = builder.add_virtual_public_input_arr(); + let block_gaslimit = builder.add_virtual_public_input(); let block_chain_id = builder.add_virtual_public_input(); let block_base_fee = builder.add_virtual_public_input_arr(); - let block_gas_used = builder.add_virtual_public_input_arr(); + let block_gas_used = builder.add_virtual_public_input(); let block_blob_base_fee = builder.add_virtual_public_input_arr(); let block_bloom = builder.add_virtual_public_input_arr(); BlockMetadataTarget { @@ -742,21 +764,17 @@ pub(crate) fn add_virtual_block_hashes, const D: us pub(crate) fn add_virtual_extra_block_data, const D: usize>( builder: &mut CircuitBuilder, ) -> ExtraBlockDataTarget { - 
let genesis_state_trie_root = builder.add_virtual_public_input_arr(); + let checkpoint_state_trie_root = builder.add_virtual_public_input_arr(); let txn_number_before = builder.add_virtual_public_input(); let txn_number_after = builder.add_virtual_public_input(); - let gas_used_before = builder.add_virtual_public_input_arr(); - let gas_used_after = builder.add_virtual_public_input_arr(); - let block_bloom_before: [Target; 64] = builder.add_virtual_public_input_arr(); - let block_bloom_after: [Target; 64] = builder.add_virtual_public_input_arr(); + let gas_used_before = builder.add_virtual_public_input(); + let gas_used_after = builder.add_virtual_public_input(); ExtraBlockDataTarget { - genesis_state_trie_root, + checkpoint_state_trie_root, txn_number_before, txn_number_after, gas_used_before, gas_used_after, - block_bloom_before, - block_bloom_after, } } @@ -769,6 +787,7 @@ pub(crate) fn add_virtual_stark_proof< stark: &S, config: &StarkConfig, degree_bits: usize, + num_ctl_helper_zs: usize, num_ctl_zs: usize, ) -> StarkProofTarget { let fri_params = config.fri_params(degree_bits); @@ -776,7 +795,7 @@ pub(crate) fn add_virtual_stark_proof< let num_leaves_per_oracle = vec![ S::COLUMNS, - stark.num_lookup_helper_columns(config) + num_ctl_zs, + stark.num_lookup_helper_columns(config) + num_ctl_helper_zs, stark.quotient_degree_factor() * config.num_challenges, ]; @@ -786,7 +805,13 @@ pub(crate) fn add_virtual_stark_proof< trace_cap: builder.add_virtual_cap(cap_height), auxiliary_polys_cap, quotient_polys_cap: builder.add_virtual_cap(cap_height), - openings: add_virtual_stark_opening_set::(builder, stark, num_ctl_zs, config), + openings: add_virtual_stark_opening_set::( + builder, + stark, + num_ctl_helper_zs, + num_ctl_zs, + config, + ), opening_proof: builder.add_virtual_fri_proof(&num_leaves_per_oracle, &fri_params), } } @@ -794,6 +819,7 @@ pub(crate) fn add_virtual_stark_proof< fn add_virtual_stark_opening_set, S: Stark, const D: usize>( builder: &mut 
CircuitBuilder, stark: &S, + num_ctl_helper_zs: usize, num_ctl_zs: usize, config: &StarkConfig, ) -> StarkOpeningSetTarget { @@ -801,10 +827,12 @@ fn add_virtual_stark_opening_set, S: Stark, c StarkOpeningSetTarget { local_values: builder.add_virtual_extension_targets(S::COLUMNS), next_values: builder.add_virtual_extension_targets(S::COLUMNS), - auxiliary_polys: builder - .add_virtual_extension_targets(stark.num_lookup_helper_columns(config) + num_ctl_zs), - auxiliary_polys_next: builder - .add_virtual_extension_targets(stark.num_lookup_helper_columns(config) + num_ctl_zs), + auxiliary_polys: builder.add_virtual_extension_targets( + stark.num_lookup_helper_columns(config) + num_ctl_helper_zs, + ), + auxiliary_polys_next: builder.add_virtual_extension_targets( + stark.num_lookup_helper_columns(config) + num_ctl_helper_zs, + ), ctl_zs_first: builder.add_virtual_targets(num_ctl_zs), quotient_polys: builder .add_virtual_extension_targets(stark.quotient_degree_factor() * num_challenges), @@ -837,7 +865,7 @@ pub(crate) fn set_stark_proof_target, W, const D: set_fri_proof_target(witness, &proof_target.opening_proof, &proof.opening_proof); } -pub(crate) fn set_public_value_targets( +pub fn set_public_value_targets( witness: &mut W, public_values_target: &PublicValuesTarget, public_values: &PublicValues, @@ -959,10 +987,10 @@ where &block_metadata_target.block_random, &h256_limbs(block_metadata.block_random), ); - // Gaslimit fits in 2 limbs - let gaslimit = u256_to_u64(block_metadata.block_gaslimit)?; - witness.set_target(block_metadata_target.block_gaslimit[0], gaslimit.0); - witness.set_target(block_metadata_target.block_gaslimit[1], gaslimit.1); + witness.set_target( + block_metadata_target.block_gaslimit, + u256_to_u32(block_metadata.block_gaslimit)?, + ); witness.set_target( block_metadata_target.block_chain_id, u256_to_u32(block_metadata.block_chain_id)?, @@ -971,10 +999,10 @@ where let basefee = u256_to_u64(block_metadata.block_base_fee)?; 
witness.set_target(block_metadata_target.block_base_fee[0], basefee.0); witness.set_target(block_metadata_target.block_base_fee[1], basefee.1); - // Gas used fits in 2 limbs - let gas_used = u256_to_u64(block_metadata.block_gas_used)?; - witness.set_target(block_metadata_target.block_gas_used[0], gas_used.0); - witness.set_target(block_metadata_target.block_gas_used[1], gas_used.1); + witness.set_target( + block_metadata_target.block_gas_used, + u256_to_u32(block_metadata.block_gas_used)?, + ); // Blobbasefee fits in 2 limbs let blob_basefee = u256_to_u64(block_metadata.block_blob_base_fee)?; witness.set_target(block_metadata_target.block_blob_base_fee[0], blob_basefee.0); @@ -1017,8 +1045,8 @@ where W: Witness, { witness.set_target_arr( - &ed_target.genesis_state_trie_root, - &h256_limbs::(ed.genesis_state_trie_root), + &ed_target.checkpoint_state_trie_root, + &h256_limbs::(ed.checkpoint_state_trie_root), ); witness.set_target( ed_target.txn_number_before, @@ -1028,29 +1056,8 @@ where ed_target.txn_number_after, u256_to_u32(ed.txn_number_after)?, ); - // Gas used before/after fit in 2 limbs - let gas_used_before = u256_to_u64(ed.gas_used_before)?; - witness.set_target(ed_target.gas_used_before[0], gas_used_before.0); - witness.set_target(ed_target.gas_used_before[1], gas_used_before.1); - let gas_used_after = u256_to_u64(ed.gas_used_after)?; - witness.set_target(ed_target.gas_used_after[0], gas_used_after.0); - witness.set_target(ed_target.gas_used_after[1], gas_used_after.1); - - let block_bloom_before = ed.block_bloom_before; - let mut block_bloom_limbs = [F::ZERO; 64]; - for (i, limbs) in block_bloom_limbs.chunks_exact_mut(8).enumerate() { - limbs.copy_from_slice(&u256_limbs(block_bloom_before[i])); - } - - witness.set_target_arr(&ed_target.block_bloom_before, &block_bloom_limbs); - - let block_bloom_after = ed.block_bloom_after; - let mut block_bloom_limbs = [F::ZERO; 64]; - for (i, limbs) in block_bloom_limbs.chunks_exact_mut(8).enumerate() { - 
limbs.copy_from_slice(&u256_limbs(block_bloom_after[i])); - } - - witness.set_target_arr(&ed_target.block_bloom_after, &block_bloom_limbs); + witness.set_target(ed_target.gas_used_before, u256_to_u32(ed.gas_used_before)?); + witness.set_target(ed_target.gas_used_after, u256_to_u32(ed.gas_used_after)?); Ok(()) } diff --git a/evm/src/stark.rs b/evm/src/stark.rs index 10f48eae47..5ff578f9fc 100644 --- a/evm/src/stark.rs +++ b/evm/src/stark.rs @@ -66,7 +66,7 @@ pub trait Stark, const D: usize>: Sync { /// Evaluate constraints at a vector of points from the degree `D` extension field. This is like /// `eval_ext`, except in the context of a recursive circuit. - /// Note: constraints must be added through`yeld_constr.constraint(builder, constraint)` in the + /// Note: constraints must be added through`yield_constr.constraint(builder, constraint)` in the /// same order as they are given in `eval_packed_generic`. fn eval_ext_circuit( &self, @@ -92,7 +92,8 @@ pub trait Stark, const D: usize>: Sync { &self, zeta: F::Extension, g: F, - num_ctl_zs: usize, + num_ctl_helpers: usize, + num_ctl_zs: Vec, config: &StarkConfig, ) -> FriInstanceInfo { let trace_oracle = FriOracleInfo { @@ -102,7 +103,7 @@ pub trait Stark, const D: usize>: Sync { let trace_info = FriPolynomialInfo::from_range(TRACE_ORACLE_INDEX, 0..Self::COLUMNS); let num_lookup_columns = self.num_lookup_helper_columns(config); - let num_auxiliary_polys = num_lookup_columns + num_ctl_zs; + let num_auxiliary_polys = num_lookup_columns + num_ctl_helpers + num_ctl_zs.len(); let auxiliary_oracle = FriOracleInfo { num_polys: num_auxiliary_polys, blinding: false, @@ -112,7 +113,7 @@ pub trait Stark, const D: usize>: Sync { let ctl_zs_info = FriPolynomialInfo::from_range( AUXILIARY_ORACLE_INDEX, - num_lookup_columns..num_lookup_columns + num_ctl_zs, + num_lookup_columns + num_ctl_helpers..num_auxiliary_polys, ); let num_quotient_polys = self.num_quotient_polys(config); @@ -152,6 +153,7 @@ pub trait Stark, const D: usize>: Sync 
{ builder: &mut CircuitBuilder, zeta: ExtensionTarget, g: F, + num_ctl_helper_polys: usize, num_ctl_zs: usize, inner_config: &StarkConfig, ) -> FriInstanceInfoTarget { @@ -162,7 +164,7 @@ pub trait Stark, const D: usize>: Sync { let trace_info = FriPolynomialInfo::from_range(TRACE_ORACLE_INDEX, 0..Self::COLUMNS); let num_lookup_columns = self.num_lookup_helper_columns(inner_config); - let num_auxiliary_polys = num_lookup_columns + num_ctl_zs; + let num_auxiliary_polys = num_lookup_columns + num_ctl_helper_polys + num_ctl_zs; let auxiliary_oracle = FriOracleInfo { num_polys: num_auxiliary_polys, blinding: false, @@ -172,7 +174,8 @@ pub trait Stark, const D: usize>: Sync { let ctl_zs_info = FriPolynomialInfo::from_range( AUXILIARY_ORACLE_INDEX, - num_lookup_columns..num_lookup_columns + num_ctl_zs, + num_lookup_columns + num_ctl_helper_polys + ..num_lookup_columns + num_ctl_helper_polys + num_ctl_zs, ); let num_quotient_polys = self.num_quotient_polys(inner_config); @@ -207,7 +210,7 @@ pub trait Stark, const D: usize>: Sync { } } - fn lookups(&self) -> Vec { + fn lookups(&self) -> Vec> { vec![] } diff --git a/evm/src/stark_testing.rs b/evm/src/stark_testing.rs index 5fe44127f9..3568f00433 100644 --- a/evm/src/stark_testing.rs +++ b/evm/src/stark_testing.rs @@ -18,7 +18,11 @@ const WITNESS_SIZE: usize = 1 << 5; /// Tests that the constraints imposed by the given STARK are low-degree by applying them to random /// low-degree witness polynomials. -pub fn test_stark_low_degree, S: Stark, const D: usize>( +pub(crate) fn test_stark_low_degree< + F: RichField + Extendable, + S: Stark, + const D: usize, +>( stark: S, ) -> Result<()> { let rate_bits = log2_ceil(stark.constraint_degree() + 1); @@ -70,7 +74,7 @@ pub fn test_stark_low_degree, S: Stark, const } /// Tests that the circuit constraints imposed by the given STARK are coherent with the native constraints. 
-pub fn test_stark_circuit_constraints< +pub(crate) fn test_stark_circuit_constraints< F: RichField + Extendable, C: GenericConfig, S: Stark, diff --git a/evm/src/util.rs b/evm/src/util.rs index 08233056b1..aec2e63e17 100644 --- a/evm/src/util.rs +++ b/evm/src/util.rs @@ -1,4 +1,4 @@ -use std::mem::{size_of, transmute_copy, ManuallyDrop}; +use core::mem::{size_of, transmute_copy, ManuallyDrop}; use ethereum_types::{H160, H256, U256}; use itertools::Itertools; @@ -14,7 +14,7 @@ use plonky2::util::transpose; use crate::witness::errors::ProgramError; /// Construct an integer from its constituent bits (in little-endian order) -pub fn limb_from_bits_le(iter: impl IntoIterator) -> P { +pub(crate) fn limb_from_bits_le(iter: impl IntoIterator) -> P { // TODO: This is technically wrong, as 1 << i won't be canonical for all fields... iter.into_iter() .enumerate() @@ -23,7 +23,7 @@ pub fn limb_from_bits_le(iter: impl IntoIterator) -> P } /// Construct an integer from its constituent bits (in little-endian order): recursive edition -pub fn limb_from_bits_le_recursive, const D: usize>( +pub(crate) fn limb_from_bits_le_recursive, const D: usize>( builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder, iter: impl IntoIterator>, ) -> ExtensionTarget { @@ -36,7 +36,7 @@ pub fn limb_from_bits_le_recursive, const D: usize> } /// A helper function to transpose a row-wise trace and put it in the format that `prove` expects. -pub fn trace_rows_to_poly_values( +pub(crate) fn trace_rows_to_poly_values( trace_rows: Vec<[F; COLUMNS]>, ) -> Vec> { let trace_row_vecs = trace_rows.into_iter().map(|row| row.to_vec()).collect_vec(); @@ -75,7 +75,36 @@ pub(crate) fn u256_to_usize(u256: U256) -> Result { u256.try_into().map_err(|_| ProgramError::IntegerTooLarge) } -#[allow(unused)] // TODO: Remove? +/// Converts a `U256` to a `u8`, erroring in case of overflow instead of panicking. 
+pub(crate) fn u256_to_u8(u256: U256) -> Result { + u256.try_into().map_err(|_| ProgramError::IntegerTooLarge) +} + +/// Converts a `U256` to a `bool`, erroring in case of overflow instead of panicking. +pub(crate) fn u256_to_bool(u256: U256) -> Result { + if u256 == U256::zero() { + Ok(false) + } else if u256 == U256::one() { + Ok(true) + } else { + Err(ProgramError::IntegerTooLarge) + } +} + +/// Converts a `U256` to a `H160`, erroring in case of overflow instead of panicking. +pub(crate) fn u256_to_h160(u256: U256) -> Result { + if u256.bits() / 8 > 20 { + return Err(ProgramError::IntegerTooLarge); + } + let mut bytes = [0u8; 32]; + u256.to_big_endian(&mut bytes); + Ok(H160( + bytes[12..] + .try_into() + .expect("This conversion cannot fail."), + )) +} + /// Returns the 32-bit little-endian limbs of a `U256`. pub(crate) fn u256_limbs(u256: U256) -> [F; 8] { u256.0 @@ -91,7 +120,6 @@ pub(crate) fn u256_limbs(u256: U256) -> [F; 8] { .unwrap() } -#[allow(unused)] /// Returns the 32-bit little-endian limbs of a `H256`. pub(crate) fn h256_limbs(h256: H256) -> [F; 8] { let mut temp_h256 = h256.0; @@ -105,7 +133,6 @@ pub(crate) fn h256_limbs(h256: H256) -> [F; 8] { .unwrap() } -#[allow(unused)] /// Returns the 32-bit limbs of a `U160`. 
pub(crate) fn h160_limbs(h160: H160) -> [F; 5] { h160.0 @@ -213,3 +240,25 @@ pub(crate) fn biguint_to_mem_vec(x: BigUint) -> Vec { pub(crate) fn h2u(h: H256) -> U256 { U256::from_big_endian(&h.0) } + +pub(crate) fn get_h160(slice: &[F]) -> H160 { + H160::from_slice( + &slice + .iter() + .rev() + .map(|x| x.to_canonical_u64() as u32) + .flat_map(|limb| limb.to_be_bytes()) + .collect_vec(), + ) +} + +pub(crate) fn get_h256(slice: &[F]) -> H256 { + H256::from_slice( + &slice + .iter() + .rev() + .map(|x| x.to_canonical_u64() as u32) + .flat_map(|limb| limb.to_be_bytes()) + .collect_vec(), + ) +} diff --git a/evm/src/vanishing_poly.rs b/evm/src/vanishing_poly.rs index 2ea6010e83..c1f2d0f92b 100644 --- a/evm/src/vanishing_poly.rs +++ b/evm/src/vanishing_poly.rs @@ -14,10 +14,12 @@ use crate::lookup::{ }; use crate::stark::Stark; +/// Evaluates all constraint, permutation and cross-table lookup polynomials +/// of the current STARK at the local and next values. pub(crate) fn eval_vanishing_poly( stark: &S, vars: &S::EvaluationFrame, - lookups: &[Lookup], + lookups: &[Lookup], lookup_vars: Option>, ctl_vars: &[CtlCheckVars], consumer: &mut ConstraintConsumer

, @@ -27,8 +29,10 @@ pub(crate) fn eval_vanishing_poly( P: PackedField, S: Stark, { + // Evaluate all of the STARK's table constraints. stark.eval_packed_generic(vars, consumer); if let Some(lookup_vars) = lookup_vars { + // Evaluate the STARK constraints related to the permutation arguments. eval_packed_lookups_generic::( stark, lookups, @@ -37,9 +41,18 @@ pub(crate) fn eval_vanishing_poly( consumer, ); } - eval_cross_table_lookup_checks::(vars, ctl_vars, consumer); + // Evaluate the STARK constraints related to the cross-table lookups. + eval_cross_table_lookup_checks::( + vars, + ctl_vars, + consumer, + stark.constraint_degree(), + ); } +/// Circuit version of `eval_vanishing_poly`. +/// Evaluates all constraint, permutation and cross-table lookup polynomials +/// of the current STARK at the local and next values. pub(crate) fn eval_vanishing_poly_circuit( builder: &mut CircuitBuilder, stark: &S, @@ -51,9 +64,18 @@ pub(crate) fn eval_vanishing_poly_circuit( F: RichField + Extendable, S: Stark, { + // Evaluate all of the STARK's table constraints. stark.eval_ext_circuit(builder, vars, consumer); if let Some(lookup_vars) = lookup_vars { + // Evaluate all of the STARK's constraints related to the permutation argument. eval_ext_lookups_circuit::(builder, stark, vars, lookup_vars, consumer); } - eval_cross_table_lookup_checks_circuit::(builder, vars, ctl_vars, consumer); + // Evaluate all of the STARK's constraints related to the cross-table lookups. 
+ eval_cross_table_lookup_checks_circuit::( + builder, + vars, + ctl_vars, + consumer, + stark.constraint_degree(), + ); } diff --git a/evm/src/verifier.rs b/evm/src/verifier.rs index 919528f189..5558227d25 100644 --- a/evm/src/verifier.rs +++ b/evm/src/verifier.rs @@ -1,4 +1,4 @@ -use std::any::type_name; +use core::any::type_name; use anyhow::{ensure, Result}; use ethereum_types::{BigEndianHash, U256}; @@ -13,12 +13,14 @@ use plonky2::plonk::plonk_common::reduce_with_powers; use crate::all_stark::{AllStark, Table, NUM_TABLES}; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; +use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cross_table_lookup::{ - verify_cross_table_lookups, CtlCheckVars, GrandProductChallenge, GrandProductChallengeSet, + num_ctl_helper_columns_by_table, verify_cross_table_lookups, CtlCheckVars, + GrandProductChallengeSet, }; use crate::evaluation_frame::StarkEvaluationFrame; -use crate::lookup::LookupCheckVars; +use crate::lookup::{GrandProductChallenge, LookupCheckVars}; use crate::memory::segments::Segment; use crate::memory::VALUE_LIMBS; use crate::proof::{ @@ -55,11 +57,17 @@ where cross_table_lookups, } = all_stark; + let num_ctl_helper_cols = num_ctl_helper_columns_by_table( + cross_table_lookups, + all_stark.arithmetic_stark.constraint_degree(), + ); + let ctl_vars_per_table = CtlCheckVars::from_proofs( &all_proof.stark_proofs, cross_table_lookups, &ctl_challenges, &num_lookup_columns, + &num_ctl_helper_cols, ); verify_stark_proof_with_challenges( @@ -121,36 +129,36 @@ where let public_values = all_proof.public_values; - // Extra products to add to the looked last value. + // Extra sums to add to the looked last value. // Only necessary for the Memory values. 
- let mut extra_looking_products = vec![vec![F::ONE; config.num_challenges]; NUM_TABLES]; + let mut extra_looking_sums = vec![vec![F::ZERO; config.num_challenges]; NUM_TABLES]; // Memory - extra_looking_products[Table::Memory as usize] = (0..config.num_challenges) - .map(|i| get_memory_extra_looking_products(&public_values, ctl_challenges.challenges[i])) + extra_looking_sums[Table::Memory as usize] = (0..config.num_challenges) + .map(|i| get_memory_extra_looking_sum(&public_values, ctl_challenges.challenges[i])) .collect_vec(); - verify_cross_table_lookups::( + verify_cross_table_lookups::( cross_table_lookups, all_proof .stark_proofs .map(|p| p.proof.openings.ctl_zs_first), - extra_looking_products, + extra_looking_sums, config, ) } /// Computes the extra product to multiply to the looked value. It contains memory operations not in the CPU trace: -/// - block metadata writes before kernel bootstrapping, -/// - trie roots writes before kernel bootstrapping. -pub(crate) fn get_memory_extra_looking_products( +/// - block metadata writes, +/// - trie roots writes. +pub(crate) fn get_memory_extra_looking_sum( public_values: &PublicValues, challenge: GrandProductChallenge, ) -> F where F: RichField + Extendable, { - let mut prod = F::ONE; + let mut sum = F::ZERO; // Add metadata and tries writes. let fields = [ @@ -238,42 +246,38 @@ where GlobalMetadata::ReceiptTrieRootDigestAfter, h2u(public_values.trie_roots_after.receipts_root), ), + (GlobalMetadata::KernelHash, h2u(KERNEL.code_hash)), + (GlobalMetadata::KernelLen, KERNEL.code.len().into()), ]; - let segment = F::from_canonical_u32(Segment::GlobalMetadata as u32); + let segment = F::from_canonical_usize(Segment::GlobalMetadata.unscale()); - fields.map(|(field, val)| prod = add_data_write(challenge, segment, prod, field as usize, val)); + fields.map(|(field, val)| { + // These fields are already scaled by their segment, and are in context 0 (kernel). 
+ sum = add_data_write(challenge, segment, sum, field.unscale(), val) + }); // Add block bloom writes. - let bloom_segment = F::from_canonical_u32(Segment::GlobalBlockBloom as u32); + let bloom_segment = F::from_canonical_usize(Segment::GlobalBlockBloom.unscale()); for index in 0..8 { let val = public_values.block_metadata.block_bloom[index]; - prod = add_data_write(challenge, bloom_segment, prod, index, val); - } - - for index in 0..8 { - let val = public_values.extra_block_data.block_bloom_before[index]; - prod = add_data_write(challenge, bloom_segment, prod, index + 8, val); - } - for index in 0..8 { - let val = public_values.extra_block_data.block_bloom_after[index]; - prod = add_data_write(challenge, bloom_segment, prod, index + 16, val); + sum = add_data_write(challenge, bloom_segment, sum, index, val); } // Add Blockhashes writes. - let block_hashes_segment = F::from_canonical_u32(Segment::BlockHashes as u32); + let block_hashes_segment = F::from_canonical_usize(Segment::BlockHashes.unscale()); for index in 0..256 { let val = h2u(public_values.block_hashes.prev_hashes[index]); - prod = add_data_write(challenge, block_hashes_segment, prod, index, val); + sum = add_data_write(challenge, block_hashes_segment, sum, index, val); } - prod + sum } fn add_data_write( challenge: GrandProductChallenge, segment: F, - running_product: F, + running_sum: F, index: usize, val: U256, ) -> F @@ -290,7 +294,7 @@ where row[j + 4] = F::from_canonical_u32((val >> (j * 32)).low_u32()); } row[12] = F::ONE; // timestamp - running_product * challenge.combine(row.iter()) + running_sum + challenge.combine(row.iter()).inverse() } pub(crate) fn verify_stark_proof_with_challenges< @@ -307,13 +311,18 @@ pub(crate) fn verify_stark_proof_with_challenges< config: &StarkConfig, ) -> Result<()> { log::debug!("Checking proof: {}", type_name::()); - validate_proof_shape(stark, proof, config, ctl_vars.len())?; + let num_ctl_polys = ctl_vars + .iter() + .map(|ctl| ctl.helper_columns.len()) + 
.sum::(); + let num_ctl_z_polys = ctl_vars.len(); + validate_proof_shape(stark, proof, config, num_ctl_polys, num_ctl_z_polys)?; let StarkOpeningSet { local_values, next_values, auxiliary_polys, auxiliary_polys_next, - ctl_zs_first, + ctl_zs_first: _, quotient_polys, } = &proof.openings; let vars = S::EvaluationFrame::from_values(local_values, next_values); @@ -381,11 +390,16 @@ pub(crate) fn verify_stark_proof_with_challenges< proof.quotient_polys_cap.clone(), ]; + let num_ctl_zs = ctl_vars + .iter() + .map(|ctl| ctl.helper_columns.len()) + .collect::>(); verify_fri_proof::( &stark.fri_instance( challenges.stark_zeta, F::primitive_root_of_unity(degree_bits), - ctl_zs_first.len(), + num_ctl_polys, + num_ctl_zs, config, ), &proof.openings.to_fri_openings(), @@ -402,6 +416,7 @@ fn validate_proof_shape( stark: &S, proof: &StarkProof, config: &StarkConfig, + num_ctl_helpers: usize, num_ctl_zs: usize, ) -> anyhow::Result<()> where @@ -431,7 +446,8 @@ where let degree_bits = proof.recover_degree_bits(config); let fri_params = config.fri_params(degree_bits); let cap_height = fri_params.config.cap_height; - let num_auxiliary = num_ctl_zs + stark.num_lookup_helper_columns(config); + + let num_auxiliary = num_ctl_helpers + stark.num_lookup_helper_columns(config) + num_ctl_zs; ensure!(trace_cap.height() == cap_height); ensure!(auxiliary_polys_cap.height() == cap_height); @@ -557,33 +573,26 @@ pub(crate) mod testutils { GlobalMetadata::ReceiptTrieRootDigestAfter, h2u(public_values.trie_roots_after.receipts_root), ), + (GlobalMetadata::KernelHash, h2u(KERNEL.code_hash)), + (GlobalMetadata::KernelLen, KERNEL.code.len().into()), ]; - let segment = F::from_canonical_u32(Segment::GlobalMetadata as u32); + let segment = F::from_canonical_usize(Segment::GlobalMetadata.unscale()); let mut extra_looking_rows = Vec::new(); fields.map(|(field, val)| { - extra_looking_rows.push(add_extra_looking_row(segment, field as usize, val)) + extra_looking_rows.push(add_extra_looking_row(segment, 
field.unscale(), val)) }); // Add block bloom writes. - let bloom_segment = F::from_canonical_u32(Segment::GlobalBlockBloom as u32); + let bloom_segment = F::from_canonical_usize(Segment::GlobalBlockBloom.unscale()); for index in 0..8 { let val = public_values.block_metadata.block_bloom[index]; extra_looking_rows.push(add_extra_looking_row(bloom_segment, index, val)); } - for index in 0..8 { - let val = public_values.extra_block_data.block_bloom_before[index]; - extra_looking_rows.push(add_extra_looking_row(bloom_segment, index + 8, val)); - } - for index in 0..8 { - let val = public_values.extra_block_data.block_bloom_after[index]; - extra_looking_rows.push(add_extra_looking_row(bloom_segment, index + 16, val)); - } - // Add Blockhashes writes. - let block_hashes_segment = F::from_canonical_u32(Segment::BlockHashes as u32); + let block_hashes_segment = F::from_canonical_usize(Segment::BlockHashes.unscale()); for index in 0..256 { let val = h2u(public_values.block_hashes.prev_hashes[index]); extra_looking_rows.push(add_extra_looking_row(block_hashes_segment, index, val)); diff --git a/evm/src/witness/errors.rs b/evm/src/witness/errors.rs index 8186246035..1b266aefde 100644 --- a/evm/src/witness/errors.rs +++ b/evm/src/witness/errors.rs @@ -1,6 +1,5 @@ use ethereum_types::U256; -#[allow(dead_code)] #[derive(Debug)] pub enum ProgramError { OutOfGas, @@ -31,8 +30,12 @@ pub enum MemoryError { pub enum ProverInputError { OutOfMptData, OutOfRlpData, + OutOfWithdrawalData, CodeHashNotFound, InvalidMptInput, InvalidInput, InvalidFunction, + NumBitsError, + InvalidJumpDestination, + InvalidJumpdestSimulation, } diff --git a/evm/src/witness/gas.rs b/evm/src/witness/gas.rs index 6f63a97957..54597a3ebc 100644 --- a/evm/src/witness/gas.rs +++ b/evm/src/witness/gas.rs @@ -1,14 +1,14 @@ use crate::witness::operation::Operation; -const KERNEL_ONLY_INSTR: u64 = 0; -const G_JUMPDEST: u64 = 1; -const G_BASE: u64 = 2; -const G_VERYLOW: u64 = 3; -const G_LOW: u64 = 5; -const G_MID: u64 
= 8; -const G_HIGH: u64 = 10; +pub(crate) const KERNEL_ONLY_INSTR: u64 = 0; +pub(crate) const G_JUMPDEST: u64 = 1; +pub(crate) const G_BASE: u64 = 2; +pub(crate) const G_VERYLOW: u64 = 3; +pub(crate) const G_LOW: u64 = 5; +pub(crate) const G_MID: u64 = 8; +pub(crate) const G_HIGH: u64 = 10; -pub(crate) fn gas_to_charge(op: Operation) -> u64 { +pub(crate) const fn gas_to_charge(op: Operation) -> u64 { use crate::arithmetic::BinaryOperator::*; use crate::arithmetic::TernaryOperator::*; use crate::witness::operation::Operation::*; @@ -48,7 +48,7 @@ pub(crate) fn gas_to_charge(op: Operation) -> u64 { GetContext => KERNEL_ONLY_INSTR, SetContext => KERNEL_ONLY_INSTR, Mload32Bytes => KERNEL_ONLY_INSTR, - Mstore32Bytes => KERNEL_ONLY_INSTR, + Mstore32Bytes(_) => KERNEL_ONLY_INSTR, ExitKernel => KERNEL_ONLY_INSTR, MloadGeneral => KERNEL_ONLY_INSTR, MstoreGeneral => KERNEL_ONLY_INSTR, diff --git a/evm/src/witness/memory.rs b/evm/src/witness/memory.rs index 5d589934a0..e6cb14f987 100644 --- a/evm/src/witness/memory.rs +++ b/evm/src/witness/memory.rs @@ -3,43 +3,47 @@ use ethereum_types::U256; use crate::cpu::membus::{NUM_CHANNELS, NUM_GP_CHANNELS}; #[derive(Clone, Copy, Debug)] -pub enum MemoryChannel { +pub(crate) enum MemoryChannel { Code, GeneralPurpose(usize), + PartialChannel, } -use MemoryChannel::{Code, GeneralPurpose}; +use MemoryChannel::{Code, GeneralPurpose, PartialChannel}; +use super::operation::CONTEXT_SCALING_FACTOR; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; -use crate::memory::segments::Segment; +use crate::memory::segments::{Segment, SEGMENT_SCALING_FACTOR}; use crate::witness::errors::MemoryError::{ContextTooLarge, SegmentTooLarge, VirtTooLarge}; use crate::witness::errors::ProgramError; use crate::witness::errors::ProgramError::MemoryError; impl MemoryChannel { - pub fn index(&self) -> usize { + pub(crate) fn index(&self) -> usize { match *self { Code => 0, GeneralPurpose(n) => { assert!(n < NUM_GP_CHANNELS); n + 1 } + 
PartialChannel => NUM_GP_CHANNELS + 1, } } } #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] -pub struct MemoryAddress { +pub(crate) struct MemoryAddress { pub(crate) context: usize, pub(crate) segment: usize, pub(crate) virt: usize, } impl MemoryAddress { - pub(crate) fn new(context: usize, segment: Segment, virt: usize) -> Self { + pub(crate) const fn new(context: usize, segment: Segment, virt: usize) -> Self { Self { context, - segment: segment as usize, + // segment is scaled + segment: segment.unscale(), virt, } } @@ -67,19 +71,30 @@ impl MemoryAddress { }) } + /// Creates a new `MemoryAddress` from a bundled address fitting a `U256`. + /// It will recover the virtual offset as the lowest 32-bit limb, the segment + /// as the next limb, and the context as the next one. + pub(crate) fn new_bundle(addr: U256) -> Result { + let virt = addr.low_u32().into(); + let segment = (addr >> SEGMENT_SCALING_FACTOR).low_u32().into(); + let context = (addr >> CONTEXT_SCALING_FACTOR).low_u32().into(); + + Self::new_u256s(context, segment, virt) + } + pub(crate) fn increment(&mut self) { self.virt = self.virt.saturating_add(1); } } #[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum MemoryOpKind { +pub(crate) enum MemoryOpKind { Read, Write, } #[derive(Clone, Copy, Debug)] -pub struct MemoryOp { +pub(crate) struct MemoryOp { /// true if this is an actual memory operation, or false if it's a padding row. 
pub filter: bool, pub timestamp: usize, @@ -88,7 +103,7 @@ pub struct MemoryOp { pub value: U256, } -pub static DUMMY_MEMOP: MemoryOp = MemoryOp { +pub(crate) static DUMMY_MEMOP: MemoryOp = MemoryOp { filter: false, timestamp: 0, address: MemoryAddress { @@ -101,7 +116,7 @@ pub static DUMMY_MEMOP: MemoryOp = MemoryOp { }; impl MemoryOp { - pub fn new( + pub(crate) fn new( channel: MemoryChannel, clock: usize, address: MemoryAddress, @@ -118,7 +133,11 @@ impl MemoryOp { } } - pub(crate) fn new_dummy_read(address: MemoryAddress, timestamp: usize, value: U256) -> Self { + pub(crate) const fn new_dummy_read( + address: MemoryAddress, + timestamp: usize, + value: U256, + ) -> Self { Self { filter: false, timestamp, @@ -128,7 +147,7 @@ impl MemoryOp { } } - pub(crate) fn sorting_key(&self) -> (usize, usize, usize, usize) { + pub(crate) const fn sorting_key(&self) -> (usize, usize, usize, usize) { ( self.address.context, self.address.segment, @@ -139,19 +158,19 @@ impl MemoryOp { } #[derive(Clone, Debug)] -pub struct MemoryState { +pub(crate) struct MemoryState { pub(crate) contexts: Vec, } impl MemoryState { - pub fn new(kernel_code: &[u8]) -> Self { + pub(crate) fn new(kernel_code: &[u8]) -> Self { let code_u256s = kernel_code.iter().map(|&x| x.into()).collect(); let mut result = Self::default(); - result.contexts[0].segments[Segment::Code as usize].content = code_u256s; + result.contexts[0].segments[Segment::Code.unscale()].content = code_u256s; result } - pub fn apply_ops(&mut self, ops: &[MemoryOp]) { + pub(crate) fn apply_ops(&mut self, ops: &[MemoryOp]) { for &op in ops { let MemoryOp { address, @@ -165,12 +184,17 @@ impl MemoryState { } } - pub fn get(&self, address: MemoryAddress) -> U256 { + pub(crate) fn get(&self, address: MemoryAddress) -> U256 { if address.context >= self.contexts.len() { return U256::zero(); } let segment = Segment::all()[address.segment]; + + if let Some(constant) = Segment::constant(&segment, address.virt) { + return constant; + } + let 
val = self.contexts[address.context].segments[address.segment].get(address.virt); assert!( val.bits() <= segment.bit_range(), @@ -182,12 +206,21 @@ impl MemoryState { val } - pub fn set(&mut self, address: MemoryAddress, val: U256) { + pub(crate) fn set(&mut self, address: MemoryAddress, val: U256) { while address.context >= self.contexts.len() { self.contexts.push(MemoryContextState::default()); } let segment = Segment::all()[address.segment]; + + if let Some(constant) = Segment::constant(&segment, address.virt) { + assert!( + constant == val, + "Attempting to set constant {} to incorrect value", + address.virt + ); + return; + } assert!( val.bits() <= segment.bit_range(), "Value {} exceeds {:?} range of {} bits", @@ -198,12 +231,9 @@ impl MemoryState { self.contexts[address.context].segments[address.segment].set(address.virt, val); } + // These fields are already scaled by their respective segment. pub(crate) fn read_global_metadata(&self, field: GlobalMetadata) -> U256 { - self.get(MemoryAddress::new( - 0, - Segment::GlobalMetadata, - field as usize, - )) + self.get(MemoryAddress::new_bundle(U256::from(field as usize)).unwrap()) } } diff --git a/evm/src/witness/mod.rs b/evm/src/witness/mod.rs index fbb88a719c..a38a552299 100644 --- a/evm/src/witness/mod.rs +++ b/evm/src/witness/mod.rs @@ -1,7 +1,7 @@ pub(crate) mod errors; -mod gas; +pub(crate) mod gas; pub(crate) mod memory; -mod operation; +pub(crate) mod operation; pub(crate) mod state; pub(crate) mod traces; pub mod transition; diff --git a/evm/src/witness/operation.rs b/evm/src/witness/operation.rs index a503ab496c..8c09fa00a2 100644 --- a/evm/src/witness/operation.rs +++ b/evm/src/witness/operation.rs @@ -3,7 +3,10 @@ use itertools::Itertools; use keccak_hash::keccak; use plonky2::field::types::Field; -use super::util::{byte_packing_log, byte_unpacking_log, push_no_write, push_with_write}; +use super::util::{ + byte_packing_log, byte_unpacking_log, mem_read_with_log, mem_write_log, + 
mem_write_partial_log_and_fill, push_no_write, push_with_write, +}; use crate::arithmetic::BinaryOperator; use crate::cpu::columns::CpuColumnsView; use crate::cpu::kernel::aggregator::KERNEL; @@ -11,16 +14,16 @@ use crate::cpu::kernel::assembler::BYTES_PER_OFFSET; use crate::cpu::kernel::constants::context_metadata::ContextMetadata; use crate::cpu::membus::NUM_GP_CHANNELS; use crate::cpu::simple_logic::eq_iszero::generate_pinv_diff; -use crate::cpu::stack_bounds::MAX_USER_STACK_SIZE; +use crate::cpu::stack::MAX_USER_STACK_SIZE; use crate::extension_tower::BN_BASE; use crate::generation::state::GenerationState; use crate::memory::segments::Segment; use crate::util::u256_to_usize; -use crate::witness::errors::MemoryError::{ContextTooLarge, SegmentTooLarge, VirtTooLarge}; +use crate::witness::errors::MemoryError::VirtTooLarge; use crate::witness::errors::ProgramError; -use crate::witness::errors::ProgramError::MemoryError; use crate::witness::memory::{MemoryAddress, MemoryChannel, MemoryOp, MemoryOpKind}; use crate::witness::operation::MemoryChannel::GeneralPurpose; +use crate::witness::transition::fill_stack_fields; use crate::witness::util::{ keccak_sponge_log, mem_read_gp_with_log_and_fill, mem_write_gp_log_and_fill, stack_pop_with_log_and_fill, @@ -49,12 +52,19 @@ pub(crate) enum Operation { GetContext, SetContext, Mload32Bytes, - Mstore32Bytes, + Mstore32Bytes(u8), ExitKernel, MloadGeneral, MstoreGeneral, } +// Contexts in the kernel are shifted by 2^64, so that they can be combined with +// the segment and virtual address components in a single U256 word. +pub(crate) const CONTEXT_SCALING_FACTOR: usize = 64; + +/// Adds a CPU row filled with the two inputs and the output of a logic operation. +/// Generates a new logic operation and adds it to the vector of operation in `LogicStark`. +/// Adds three memory read operations to `MemoryStark`: for the two inputs and the output. 
pub(crate) fn generate_binary_logic_op( op: logic::Op, state: &mut GenerationState, @@ -63,7 +73,7 @@ pub(crate) fn generate_binary_logic_op( let [(in0, _), (in1, log_in1)] = stack_pop_with_log_and_fill::<2, _>(state, &mut row)?; let operation = logic::Operation::new(op, in0, in1); - push_no_write(state, &mut row, operation.result, Some(NUM_GP_CHANNELS - 1)); + push_no_write(state, operation.result); state.traces.push_logic(operation); state.traces.push_memory(log_in1); @@ -92,12 +102,7 @@ pub(crate) fn generate_binary_arithmetic_op( } } - push_no_write( - state, - &mut row, - operation.result(), - Some(NUM_GP_CHANNELS - 1), - ); + push_no_write(state, operation.result()); state.traces.push_arithmetic(operation); state.traces.push_memory(log_in1); @@ -114,12 +119,7 @@ pub(crate) fn generate_ternary_arithmetic_op( stack_pop_with_log_and_fill::<3, _>(state, &mut row)?; let operation = arithmetic::Operation::ternary(operator, input0, input1, input2); - push_no_write( - state, - &mut row, - operation.result(), - Some(NUM_GP_CHANNELS - 1), - ); + push_no_write(state, operation.result()); state.traces.push_arithmetic(operation); state.traces.push_memory(log_in1); @@ -132,12 +132,10 @@ pub(crate) fn generate_keccak_general( state: &mut GenerationState, mut row: CpuColumnsView, ) -> Result<(), ProgramError> { - row.is_keccak_sponge = F::ONE; - let [(context, _), (segment, log_in1), (base_virt, log_in2), (len, log_in3)] = - stack_pop_with_log_and_fill::<4, _>(state, &mut row)?; + let [(addr, _), (len, log_in1)] = stack_pop_with_log_and_fill::<2, _>(state, &mut row)?; let len = u256_to_usize(len)?; - let base_address = MemoryAddress::new_u256s(context, segment, base_virt)?; + let base_address = MemoryAddress::new_bundle(addr)?; let input = (0..len) .map(|i| { let address = MemoryAddress { @@ -151,13 +149,11 @@ pub(crate) fn generate_keccak_general( log::debug!("Hashing {:?}", input); let hash = keccak(&input); - push_no_write(state, &mut row, hash.into_uint(), 
Some(NUM_GP_CHANNELS - 1)); + push_no_write(state, hash.into_uint()); keccak_sponge_log(state, base_address, input); state.traces.push_memory(log_in1); - state.traces.push_memory(log_in2); - state.traces.push_memory(log_in3); state.traces.push_cpu(row); Ok(()) } @@ -169,7 +165,22 @@ pub(crate) fn generate_prover_input( let pc = state.registers.program_counter; let input_fn = &KERNEL.prover_inputs[&pc]; let input = state.prover_input(input_fn)?; + let opcode = 0x49.into(); + // `ArithmeticStark` range checks `mem_channels[0]`, which contains + // the top of the stack, `mem_channels[1]`, `mem_channels[2]` and + // next_row's `mem_channels[0]` which contains the next top of the stack. + // Our goal here is to range-check the input, in the next stack top. + let range_check_op = arithmetic::Operation::range_check( + state.registers.stack_top, + U256::from(0), + U256::from(0), + opcode, + input, + ); + push_with_write(state, &mut row, input)?; + + state.traces.push_arithmetic(range_check_op); state.traces.push_cpu(row); Ok(()) } @@ -180,6 +191,17 @@ pub(crate) fn generate_pop( ) -> Result<(), ProgramError> { let [(_, _)] = stack_pop_with_log_and_fill::<1, _>(state, &mut row)?; + let diff = row.stack_len - F::ONE; + if let Some(inv) = diff.try_inverse() { + row.general.stack_mut().stack_inv = inv; + row.general.stack_mut().stack_inv_aux = F::ONE; + row.general.stack_mut().stack_inv_aux_2 = F::ONE; + state.registers.is_stack_top_read = true; + } else { + row.general.stack_mut().stack_inv = F::ZERO; + row.general.stack_mut().stack_inv_aux = F::ZERO; + } + state.traces.push_cpu(row); Ok(()) @@ -318,7 +340,26 @@ pub(crate) fn generate_get_context( state: &mut GenerationState, mut row: CpuColumnsView, ) -> Result<(), ProgramError> { - push_with_write(state, &mut row, state.registers.context.into())?; + // Same logic as push_with_write, but we have to use channel 3 for stack constraint reasons. 
+ let write = if state.registers.stack_len == 0 { + None + } else { + let address = MemoryAddress::new( + state.registers.context, + Segment::Stack, + state.registers.stack_len - 1, + ); + let res = mem_write_gp_log_and_fill(2, address, state, &mut row, state.registers.stack_top); + Some(res) + }; + push_no_write( + state, + // The fetched value needs to be scaled before being pushed. + U256::from(state.registers.context) << CONTEXT_SCALING_FACTOR, + ); + if let Some(log) = write { + state.traces.push_memory(log); + } state.traces.push_cpu(row); Ok(()) } @@ -332,13 +373,17 @@ pub(crate) fn generate_set_context( let sp_to_save = state.registers.stack_len.into(); let old_ctx = state.registers.context; - let new_ctx = u256_to_usize(ctx)?; + // The popped value needs to be scaled down. + let new_ctx = u256_to_usize(ctx >> CONTEXT_SCALING_FACTOR)?; - let sp_field = ContextMetadata::StackSize as usize; + let sp_field = ContextMetadata::StackSize.unscale(); let old_sp_addr = MemoryAddress::new(old_ctx, Segment::ContextMetadata, sp_field); let new_sp_addr = MemoryAddress::new(new_ctx, Segment::ContextMetadata, sp_field); - let log_write_old_sp = mem_write_gp_log_and_fill(1, old_sp_addr, state, &mut row, sp_to_save); + // This channel will hold in limb 0 and 1 the one-limb value of two separate memory operations: + // the old stack pointer write and the new stack pointer read. + // Channels only matter for time stamps: the write must happen before the read. 
+ let log_write_old_sp = mem_write_log(GeneralPurpose(1), old_sp_addr, state, sp_to_save); let (new_sp, log_read_new_sp) = if old_ctx == new_ctx { let op = MemoryOp::new( MemoryChannel::GeneralPurpose(2), @@ -347,23 +392,9 @@ pub(crate) fn generate_set_context( MemoryOpKind::Read, sp_to_save, ); - - let channel = &mut row.mem_channels[2]; - assert_eq!(channel.used, F::ZERO); - channel.used = F::ONE; - channel.is_read = F::ONE; - channel.addr_context = F::from_canonical_usize(new_ctx); - channel.addr_segment = F::from_canonical_usize(Segment::ContextMetadata as usize); - channel.addr_virtual = F::from_canonical_usize(new_sp_addr.virt); - let val_limbs: [u64; 4] = sp_to_save.0; - for (i, limb) in val_limbs.into_iter().enumerate() { - channel.value[2 * i] = F::from_canonical_u32(limb as u32); - channel.value[2 * i + 1] = F::from_canonical_u32((limb >> 32) as u32); - } - (sp_to_save, op) } else { - mem_read_gp_with_log_and_fill(2, new_sp_addr, state, &mut row) + mem_read_with_log(GeneralPurpose(2), new_sp_addr, state) }; // If the new stack isn't empty, read stack_top from memory. 
@@ -374,14 +405,16 @@ pub(crate) fn generate_set_context( if let Some(inv) = new_sp_field.try_inverse() { row.general.stack_mut().stack_inv = inv; row.general.stack_mut().stack_inv_aux = F::ONE; + row.general.stack_mut().stack_inv_aux_2 = F::ONE; } else { row.general.stack_mut().stack_inv = F::ZERO; row.general.stack_mut().stack_inv_aux = F::ZERO; + row.general.stack_mut().stack_inv_aux_2 = F::ZERO; } let new_top_addr = MemoryAddress::new(new_ctx, Segment::Stack, new_sp - 1); let (new_top, log_read_new_top) = - mem_read_gp_with_log_and_fill(3, new_top_addr, state, &mut row); + mem_read_gp_with_log_and_fill(2, new_top_addr, state, &mut row); state.registers.stack_top = new_top; state.traces.push_memory(log_read_new_top); } else { @@ -394,6 +427,7 @@ pub(crate) fn generate_set_context( state.traces.push_memory(log_write_old_sp); state.traces.push_memory(log_read_new_sp); state.traces.push_cpu(row); + Ok(()) } @@ -410,23 +444,26 @@ pub(crate) fn generate_push( } let initial_offset = state.registers.program_counter + 1; + let base_address = MemoryAddress::new(code_context, Segment::Code, initial_offset); // First read val without going through `mem_read_with_log` type methods, so we can pass it // to stack_push_log_and_fill. 
let bytes = (0..num_bytes) .map(|i| { state .memory - .get(MemoryAddress::new( - code_context, - Segment::Code, - initial_offset + i, - )) + .get(MemoryAddress { + virt: base_address.virt + i, + ..base_address + }) .low_u32() as u8 }) .collect_vec(); let val = U256::from_big_endian(&bytes); push_with_write(state, &mut row, val)?; + + byte_packing_log(state, base_address, bytes); + state.traces.push_cpu(row); Ok(()) @@ -493,7 +530,7 @@ pub(crate) fn generate_dup( } else { mem_read_gp_with_log_and_fill(2, other_addr, state, &mut row) }; - push_no_write(state, &mut row, val, None); + push_no_write(state, val); state.traces.push_memory(log_read); state.traces.push_cpu(row); @@ -515,7 +552,7 @@ pub(crate) fn generate_swap( let [(in0, _)] = stack_pop_with_log_and_fill::<1, _>(state, &mut row)?; let (in1, log_in1) = mem_read_gp_with_log_and_fill(1, other_addr, state, &mut row); let log_out0 = mem_write_gp_log_and_fill(2, other_addr, state, &mut row, in0); - push_no_write(state, &mut row, in1, None); + push_no_write(state, in1); state.traces.push_memory(log_in1); state.traces.push_memory(log_out0); @@ -529,7 +566,18 @@ pub(crate) fn generate_not( ) -> Result<(), ProgramError> { let [(x, _)] = stack_pop_with_log_and_fill::<1, _>(state, &mut row)?; let result = !x; - push_no_write(state, &mut row, result, Some(NUM_GP_CHANNELS - 1)); + push_no_write(state, result); + + // This is necessary for the stack constraints for POP, + // since the two flags are combined. 
+ let diff = row.stack_len - F::ONE; + if let Some(inv) = diff.try_inverse() { + row.general.stack_mut().stack_inv = inv; + row.general.stack_mut().stack_inv_aux = F::ONE; + } else { + row.general.stack_mut().stack_inv = F::ZERO; + row.general.stack_mut().stack_inv_aux = F::ZERO; + } state.traces.push_cpu(row); Ok(()) @@ -548,7 +596,7 @@ pub(crate) fn generate_iszero( generate_pinv_diff(x, U256::zero(), &mut row); - push_no_write(state, &mut row, result, None); + push_no_write(state, result); state.traces.push_cpu(row); Ok(()) } @@ -587,7 +635,7 @@ fn append_shift( let operation = arithmetic::Operation::binary(operator, input0, input1); state.traces.push_arithmetic(operation); - push_no_write(state, &mut row, result, Some(NUM_GP_CHANNELS - 1)); + push_no_write(state, result); state.traces.push_memory(log_in1); state.traces.push_cpu(row); Ok(()) @@ -628,7 +676,7 @@ pub(crate) fn generate_syscall( state: &mut GenerationState, mut row: CpuColumnsView, ) -> Result<(), ProgramError> { - if TryInto::::try_into(state.registers.gas_used).is_err() { + if TryInto::::try_into(state.registers.gas_used).is_err() { return Err(ProgramError::GasLimitError); } @@ -646,32 +694,50 @@ pub(crate) fn generate_syscall( let handler_addr_addr = handler_jumptable_addr + (opcode as usize) * (BYTES_PER_OFFSET as usize); assert_eq!(BYTES_PER_OFFSET, 3, "Code below assumes 3 bytes per offset"); - let (handler_addr0, log_in0) = mem_read_gp_with_log_and_fill( - 1, - MemoryAddress::new(0, Segment::Code, handler_addr_addr), - state, - &mut row, - ); - let (handler_addr1, log_in1) = mem_read_gp_with_log_and_fill( - 2, - MemoryAddress::new(0, Segment::Code, handler_addr_addr + 1), - state, - &mut row, - ); - let (handler_addr2, log_in2) = mem_read_gp_with_log_and_fill( - 3, - MemoryAddress::new(0, Segment::Code, handler_addr_addr + 2), - state, - &mut row, - ); + let base_address = MemoryAddress::new(0, Segment::Code, handler_addr_addr); + let bytes = (0..BYTES_PER_OFFSET as usize) + .map(|i| { + let 
address = MemoryAddress { + virt: base_address.virt + i, + ..base_address + }; + let val = state.memory.get(address); + val.low_u32() as u8 + }) + .collect_vec(); + + let packed_int = U256::from_big_endian(&bytes); + + let jumptable_channel = &mut row.mem_channels[1]; + jumptable_channel.is_read = F::ONE; + jumptable_channel.addr_context = F::ZERO; + jumptable_channel.addr_segment = F::from_canonical_usize(Segment::Code as usize); + jumptable_channel.addr_virtual = F::from_canonical_usize(handler_addr_addr); + jumptable_channel.value[0] = F::from_canonical_usize(u256_to_usize(packed_int)?); + + byte_packing_log(state, base_address, bytes); + + let new_program_counter = u256_to_usize(packed_int)?; - let handler_addr = (handler_addr0 << 16) + (handler_addr1 << 8) + handler_addr2; - let new_program_counter = u256_to_usize(handler_addr)?; + let gas = U256::from(state.registers.gas_used); let syscall_info = U256::from(state.registers.program_counter + 1) + (U256::from(u64::from(state.registers.is_kernel)) << 32) - + (U256::from(state.registers.gas_used) << 192); - + + (gas << 192); + + // `ArithmeticStark` range checks `mem_channels[0]`, which contains + // the top of the stack, `mem_channels[1]`, which contains the new PC, + // `mem_channels[2]`, which is empty, and next_row's `mem_channels[0]`, + // which contains the next top of the stack. + // Our goal here is to range-check the gas, contained in syscall_info, + // stored in the next stack top. + let range_check_op = arithmetic::Operation::range_check( + state.registers.stack_top, + packed_int, + U256::from(0), + U256::from(opcode), + syscall_info, + ); // Set registers before pushing to the stack; in particular, we need to set kernel mode so we // can't incorrectly trigger a stack overflow. However, note that we have to do it _after_ we // make `syscall_info`, which should contain the old values. 
@@ -683,9 +749,7 @@ pub(crate) fn generate_syscall( log::debug!("Syscall to {}", KERNEL.offset_name(new_program_counter)); - state.traces.push_memory(log_in0); - state.traces.push_memory(log_in1); - state.traces.push_memory(log_in2); + state.traces.push_arithmetic(range_check_op); state.traces.push_cpu(row); Ok(()) @@ -701,7 +765,7 @@ pub(crate) fn generate_eq( generate_pinv_diff(in0, in1, &mut row); - push_no_write(state, &mut row, result, None); + push_no_write(state, result); state.traces.push_memory(log_in1); state.traces.push_cpu(row); Ok(()) @@ -718,7 +782,7 @@ pub(crate) fn generate_exit_kernel( assert!(is_kernel_mode_val == 0 || is_kernel_mode_val == 1); let is_kernel_mode = is_kernel_mode_val != 0; let gas_used_val = kexit_info.0[3]; - if TryInto::::try_into(gas_used_val).is_err() { + if TryInto::::try_into(gas_used_val).is_err() { return Err(ProgramError::GasLimitError); } @@ -740,18 +804,16 @@ pub(crate) fn generate_mload_general( state: &mut GenerationState, mut row: CpuColumnsView, ) -> Result<(), ProgramError> { - let [(context, _), (segment, log_in1), (virt, log_in2)] = - stack_pop_with_log_and_fill::<3, _>(state, &mut row)?; + let [(addr, _)] = stack_pop_with_log_and_fill::<1, _>(state, &mut row)?; - let (val, log_read) = mem_read_gp_with_log_and_fill( - 3, - MemoryAddress::new_u256s(context, segment, virt)?, - state, - &mut row, - ); - push_no_write(state, &mut row, val, None); + let (val, log_read) = + mem_read_gp_with_log_and_fill(1, MemoryAddress::new_bundle(addr)?, state, &mut row); + push_no_write(state, val); - let diff = row.stack_len - F::from_canonical_usize(4); + // Because MLOAD_GENERAL performs 1 pop and 1 push, it does not make use of the `stack_inv_aux` general columns. + // We hence can set the diff to 2 (instead of 1) so that the stack constraint for MSTORE_GENERAL applies to both + // operations, which are combined into a single CPU flag. 
+ let diff = row.stack_len - F::TWO; if let Some(inv) = diff.try_inverse() { row.general.stack_mut().stack_inv = inv; row.general.stack_mut().stack_inv_aux = F::ONE; @@ -760,8 +822,6 @@ pub(crate) fn generate_mload_general( row.general.stack_mut().stack_inv_aux = F::ZERO; } - state.traces.push_memory(log_in1); - state.traces.push_memory(log_in2); state.traces.push_memory(log_read); state.traces.push_cpu(row); Ok(()) @@ -771,15 +831,14 @@ pub(crate) fn generate_mload_32bytes( state: &mut GenerationState, mut row: CpuColumnsView, ) -> Result<(), ProgramError> { - let [(context, _), (segment, log_in1), (base_virt, log_in2), (len, log_in3)] = - stack_pop_with_log_and_fill::<4, _>(state, &mut row)?; + let [(addr, _), (len, log_in1)] = stack_pop_with_log_and_fill::<2, _>(state, &mut row)?; let len = u256_to_usize(len)?; if len > 32 { // The call to `U256::from_big_endian()` would panic. return Err(ProgramError::IntegerTooLarge); } - let base_address = MemoryAddress::new_u256s(context, segment, base_virt)?; + let base_address = MemoryAddress::new_bundle(addr)?; if usize::MAX - base_address.virt < len { return Err(ProgramError::MemoryError(VirtTooLarge { virt: base_address.virt.into(), @@ -797,13 +856,11 @@ pub(crate) fn generate_mload_32bytes( .collect_vec(); let packed_int = U256::from_big_endian(&bytes); - push_no_write(state, &mut row, packed_int, Some(4)); + push_no_write(state, packed_int); byte_packing_log(state, base_address, bytes); state.traces.push_memory(log_in1); - state.traces.push_memory(log_in2); - state.traces.push_memory(log_in3); state.traces.push_cpu(row); Ok(()) } @@ -812,23 +869,12 @@ pub(crate) fn generate_mstore_general( state: &mut GenerationState, mut row: CpuColumnsView, ) -> Result<(), ProgramError> { - let [(context, _), (segment, log_in1), (virt, log_in2), (val, log_in3)] = - stack_pop_with_log_and_fill::<4, _>(state, &mut row)?; + let [(val, _), (addr, log_in1)] = stack_pop_with_log_and_fill::<2, _>(state, &mut row)?; - let address = 
MemoryAddress { - context: context - .try_into() - .map_err(|_| MemoryError(ContextTooLarge { context }))?, - segment: segment - .try_into() - .map_err(|_| MemoryError(SegmentTooLarge { segment }))?, - virt: virt - .try_into() - .map_err(|_| MemoryError(VirtTooLarge { virt }))?, - }; - let log_write = mem_write_gp_log_and_fill(4, address, state, &mut row, val); + let address = MemoryAddress::new_bundle(addr)?; + let log_write = mem_write_partial_log_and_fill(address, state, &mut row, val); - let diff = row.stack_len - F::from_canonical_usize(4); + let diff = row.stack_len - F::TWO; if let Some(inv) = diff.try_inverse() { row.general.stack_mut().stack_inv = inv; row.general.stack_mut().stack_inv_aux = F::ONE; @@ -840,30 +886,28 @@ pub(crate) fn generate_mstore_general( } state.traces.push_memory(log_in1); - state.traces.push_memory(log_in2); - state.traces.push_memory(log_in3); state.traces.push_memory(log_write); + state.traces.push_cpu(row); Ok(()) } pub(crate) fn generate_mstore_32bytes( + n: u8, state: &mut GenerationState, mut row: CpuColumnsView, ) -> Result<(), ProgramError> { - let [(context, _), (segment, log_in1), (base_virt, log_in2), (val, log_in3), (len, log_in4)] = - stack_pop_with_log_and_fill::<5, _>(state, &mut row)?; - let len = u256_to_usize(len)?; + let [(addr, _), (val, log_in1)] = stack_pop_with_log_and_fill::<2, _>(state, &mut row)?; + + let base_address = MemoryAddress::new_bundle(addr)?; - let base_address = MemoryAddress::new_u256s(context, segment, base_virt)?; + byte_unpacking_log(state, base_address, val, n as usize); - byte_unpacking_log(state, base_address, val, len); + let new_addr = addr + n; + push_no_write(state, new_addr); state.traces.push_memory(log_in1); - state.traces.push_memory(log_in2); - state.traces.push_memory(log_in3); - state.traces.push_memory(log_in4); state.traces.push_cpu(row); Ok(()) } @@ -873,50 +917,18 @@ pub(crate) fn generate_exception( state: &mut GenerationState, mut row: CpuColumnsView, ) -> Result<(), 
ProgramError> { - if TryInto::::try_into(state.registers.gas_used).is_err() { + if TryInto::::try_into(state.registers.gas_used).is_err() { return Err(ProgramError::GasLimitError); } row.op.exception = F::ONE; - let disallowed_len = F::from_canonical_usize(MAX_USER_STACK_SIZE + 1); - let diff = row.stack_len - disallowed_len; - if let Some(inv) = diff.try_inverse() { - row.stack_len_bounds_aux = inv; - } else { - // This is a stack overflow that should have been caught earlier. - return Err(ProgramError::InterpreterError); - } - if let Some(inv) = row.stack_len.try_inverse() { row.general.stack_mut().stack_inv = inv; row.general.stack_mut().stack_inv_aux = F::ONE; } - if state.registers.is_stack_top_read { - let channel = &mut row.mem_channels[0]; - channel.used = F::ONE; - channel.is_read = F::ONE; - channel.addr_context = F::from_canonical_usize(state.registers.context); - channel.addr_segment = F::from_canonical_usize(Segment::Stack as usize); - channel.addr_virtual = F::from_canonical_usize(state.registers.stack_len - 1); - - let address = MemoryAddress { - context: state.registers.context, - segment: Segment::Stack as usize, - virt: state.registers.stack_len - 1, - }; - - let mem_op = MemoryOp::new( - GeneralPurpose(0), - state.traces.clock(), - address, - MemoryOpKind::Read, - state.registers.stack_top, - ); - state.traces.push_memory(mem_op); - state.registers.is_stack_top_read = false; - } + fill_stack_fields(state, &mut row)?; row.general.exception_mut().exc_code_bits = [ F::from_bool(exc_code & 1 != 0), @@ -928,31 +940,52 @@ pub(crate) fn generate_exception( let handler_addr_addr = handler_jumptable_addr + (exc_code as usize) * (BYTES_PER_OFFSET as usize); assert_eq!(BYTES_PER_OFFSET, 3, "Code below assumes 3 bytes per offset"); - let (handler_addr0, log_in0) = mem_read_gp_with_log_and_fill( - 1, - MemoryAddress::new(0, Segment::Code, handler_addr_addr), - state, - &mut row, - ); - let (handler_addr1, log_in1) = mem_read_gp_with_log_and_fill( - 2, - 
MemoryAddress::new(0, Segment::Code, handler_addr_addr + 1), - state, - &mut row, - ); - let (handler_addr2, log_in2) = mem_read_gp_with_log_and_fill( - 3, - MemoryAddress::new(0, Segment::Code, handler_addr_addr + 2), - state, - &mut row, - ); + let base_address = MemoryAddress::new(0, Segment::Code, handler_addr_addr); + let bytes = (0..BYTES_PER_OFFSET as usize) + .map(|i| { + let address = MemoryAddress { + virt: base_address.virt + i, + ..base_address + }; + let val = state.memory.get(address); + val.low_u32() as u8 + }) + .collect_vec(); - let handler_addr = (handler_addr0 << 16) + (handler_addr1 << 8) + handler_addr2; - let new_program_counter = u256_to_usize(handler_addr)?; + let packed_int = U256::from_big_endian(&bytes); - let exc_info = - U256::from(state.registers.program_counter) + (U256::from(state.registers.gas_used) << 192); + let jumptable_channel = &mut row.mem_channels[1]; + jumptable_channel.is_read = F::ONE; + jumptable_channel.addr_context = F::ZERO; + jumptable_channel.addr_segment = F::from_canonical_usize(Segment::Code as usize); + jumptable_channel.addr_virtual = F::from_canonical_usize(handler_addr_addr); + jumptable_channel.value[0] = F::from_canonical_usize(u256_to_usize(packed_int)?); + byte_packing_log(state, base_address, bytes); + let new_program_counter = u256_to_usize(packed_int)?; + + let gas = U256::from(state.registers.gas_used); + + let exc_info = U256::from(state.registers.program_counter) + (gas << 192); + + // Get the opcode so we can provide it to the range_check operation. 
+ let code_context = state.registers.code_context(); + let address = MemoryAddress::new(code_context, Segment::Code, state.registers.program_counter); + let opcode = state.memory.get(address); + + // `ArithmeticStark` range checks `mem_channels[0]`, which contains + // the top of the stack, `mem_channels[1]`, which contains the new PC, + // `mem_channels[2]`, which is empty, and next_row's `mem_channels[0]`, + // which contains the next top of the stack. + // Our goal here is to range-check the gas, contained in syscall_info, + // stored in the next stack top. + let range_check_op = arithmetic::Operation::range_check( + state.registers.stack_top, + packed_int, + U256::from(0), + opcode, + exc_info, + ); // Set registers before pushing to the stack; in particular, we need to set kernel mode so we // can't incorrectly trigger a stack overflow. However, note that we have to do it _after_ we // make `exc_info`, which should contain the old values. @@ -963,10 +996,7 @@ pub(crate) fn generate_exception( push_with_write(state, &mut row, exc_info)?; log::debug!("Exception to {}", KERNEL.offset_name(new_program_counter)); - - state.traces.push_memory(log_in0); - state.traces.push_memory(log_in1); - state.traces.push_memory(log_in2); + state.traces.push_arithmetic(range_check_op); state.traces.push_cpu(row); Ok(()) diff --git a/evm/src/witness/state.rs b/evm/src/witness/state.rs index 406ae8567f..1070ee6439 100644 --- a/evm/src/witness/state.rs +++ b/evm/src/witness/state.rs @@ -12,12 +12,15 @@ pub struct RegistersState { pub stack_top: U256, // Indicates if you read the new stack_top from memory to set the channel accordingly. pub is_stack_top_read: bool, + // Indicates if the previous operation might have caused an overflow, and we must check + // if it's the case. 
+ pub check_overflow: bool, pub context: usize, pub gas_used: u64, } impl RegistersState { - pub(crate) fn code_context(&self) -> usize { + pub(crate) const fn code_context(&self) -> usize { if self.is_kernel { KERNEL_CONTEXT } else { @@ -34,6 +37,7 @@ impl Default for RegistersState { stack_len: 0, stack_top: U256::zero(), is_stack_top_read: false, + check_overflow: false, context: 0, gas_used: 0, } diff --git a/evm/src/witness/traces.rs b/evm/src/witness/traces.rs index 91035fc403..f7f5c9d365 100644 --- a/evm/src/witness/traces.rs +++ b/evm/src/witness/traces.rs @@ -1,4 +1,4 @@ -use std::mem::size_of; +use core::mem::size_of; use itertools::Itertools; use plonky2::field::extension::Extendable; @@ -19,7 +19,7 @@ use crate::witness::memory::MemoryOp; use crate::{arithmetic, keccak, keccak_sponge, logic}; #[derive(Clone, Copy, Debug)] -pub struct TraceCheckpoint { +pub(crate) struct TraceCheckpoint { pub(self) arithmetic_len: usize, pub(self) byte_packing_len: usize, pub(self) cpu_len: usize, @@ -41,7 +41,7 @@ pub(crate) struct Traces { } impl Traces { - pub fn new() -> Self { + pub(crate) fn new() -> Self { Traces { arithmetic_ops: vec![], byte_packing_ops: vec![], @@ -55,7 +55,7 @@ impl Traces { /// Returns the actual trace lengths for each STARK module. // Uses a `TraceCheckPoint` as return object for convenience. - pub fn get_lengths(&self) -> TraceCheckpoint { + pub(crate) fn get_lengths(&self) -> TraceCheckpoint { TraceCheckpoint { arithmetic_len: self .arithmetic_ops @@ -66,9 +66,14 @@ impl Traces { BinaryOperator::Div | BinaryOperator::Mod => 2, _ => 1, }, + Operation::RangeCheckOperation { .. 
} => 1, }) .sum(), - byte_packing_len: self.byte_packing_ops.iter().map(|op| op.bytes.len()).sum(), + byte_packing_len: self + .byte_packing_ops + .iter() + .map(|op| usize::from(!op.bytes.is_empty())) + .sum(), cpu_len: self.cpu.len(), keccak_len: self.keccak_inputs.len() * keccak::keccak_stark::NUM_ROUNDS, keccak_sponge_len: self @@ -84,7 +89,7 @@ impl Traces { } /// Returns the number of operations for each STARK module. - pub fn checkpoint(&self) -> TraceCheckpoint { + pub(crate) fn checkpoint(&self) -> TraceCheckpoint { TraceCheckpoint { arithmetic_len: self.arithmetic_ops.len(), byte_packing_len: self.byte_packing_ops.len(), @@ -96,7 +101,7 @@ impl Traces { } } - pub fn rollback(&mut self, checkpoint: TraceCheckpoint) { + pub(crate) fn rollback(&mut self, checkpoint: TraceCheckpoint) { self.arithmetic_ops.truncate(checkpoint.arithmetic_len); self.byte_packing_ops.truncate(checkpoint.byte_packing_len); self.cpu.truncate(checkpoint.cpu_len); @@ -107,35 +112,39 @@ impl Traces { self.memory_ops.truncate(checkpoint.memory_len); } - pub fn mem_ops_since(&self, checkpoint: TraceCheckpoint) -> &[MemoryOp] { + pub(crate) fn mem_ops_since(&self, checkpoint: TraceCheckpoint) -> &[MemoryOp] { &self.memory_ops[checkpoint.memory_len..] 
} - pub fn push_cpu(&mut self, val: CpuColumnsView) { + pub(crate) fn push_cpu(&mut self, val: CpuColumnsView) { self.cpu.push(val); } - pub fn push_logic(&mut self, op: logic::Operation) { + pub(crate) fn push_logic(&mut self, op: logic::Operation) { self.logic_ops.push(op); } - pub fn push_arithmetic(&mut self, op: arithmetic::Operation) { + pub(crate) fn push_arithmetic(&mut self, op: arithmetic::Operation) { self.arithmetic_ops.push(op); } - pub fn push_memory(&mut self, op: MemoryOp) { + pub(crate) fn push_memory(&mut self, op: MemoryOp) { self.memory_ops.push(op); } - pub fn push_byte_packing(&mut self, op: BytePackingOp) { + pub(crate) fn push_byte_packing(&mut self, op: BytePackingOp) { self.byte_packing_ops.push(op); } - pub fn push_keccak(&mut self, input: [u64; keccak::keccak_stark::NUM_INPUTS], clock: usize) { + pub(crate) fn push_keccak( + &mut self, + input: [u64; keccak::keccak_stark::NUM_INPUTS], + clock: usize, + ) { self.keccak_inputs.push((input, clock)); } - pub fn push_keccak_bytes(&mut self, input: [u8; KECCAK_WIDTH_BYTES], clock: usize) { + pub(crate) fn push_keccak_bytes(&mut self, input: [u8; KECCAK_WIDTH_BYTES], clock: usize) { let chunks = input .chunks(size_of::()) .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap())) @@ -145,15 +154,15 @@ impl Traces { self.push_keccak(chunks, clock); } - pub fn push_keccak_sponge(&mut self, op: KeccakSpongeOp) { + pub(crate) fn push_keccak_sponge(&mut self, op: KeccakSpongeOp) { self.keccak_sponge_ops.push(op); } - pub fn clock(&self) -> usize { + pub(crate) fn clock(&self) -> usize { self.cpu.len() } - pub fn into_tables( + pub(crate) fn into_tables( self, all_stark: &AllStark, config: &StarkConfig, diff --git a/evm/src/witness/transition.rs b/evm/src/witness/transition.rs index abc398e644..b26a133964 100644 --- a/evm/src/witness/transition.rs +++ b/evm/src/witness/transition.rs @@ -6,10 +6,11 @@ use super::memory::{MemoryOp, MemoryOpKind}; use super::util::fill_channel_with_value; use 
crate::cpu::columns::CpuColumnsView; use crate::cpu::kernel::aggregator::KERNEL; +use crate::cpu::kernel::constants::context_metadata::ContextMetadata; use crate::cpu::stack::{ - EQ_STACK_BEHAVIOR, IS_ZERO_STACK_BEHAVIOR, JUMPI_OP, JUMP_OP, STACK_BEHAVIORS, + EQ_STACK_BEHAVIOR, IS_ZERO_STACK_BEHAVIOR, JUMPI_OP, JUMP_OP, MAX_USER_STACK_SIZE, + MIGHT_OVERFLOW, STACK_BEHAVIORS, }; -use crate::cpu::stack_bounds::MAX_USER_STACK_SIZE; use crate::generation::state::GenerationState; use crate::memory::segments::Segment; use crate::witness::errors::ProgramError; @@ -33,7 +34,7 @@ fn read_code_memory(state: &mut GenerationState, row: &mut CpuColum opcode } -fn decode(registers: RegistersState, opcode: u8) -> Result { +pub(crate) fn decode(registers: RegistersState, opcode: u8) -> Result { match (opcode, registers.is_kernel) { (0x00, _) => Ok(Operation::Syscall(opcode, 0, false)), // STOP (0x01, _) => Ok(Operation::BinaryArithmetic(arithmetic::BinaryOperator::Add)), @@ -136,7 +137,7 @@ fn decode(registers: RegistersState, opcode: u8) -> Result Ok(Operation::Mstore32Bytes), + (0xc0..=0xdf, true) => Ok(Operation::Mstore32Bytes(opcode - 0xc0 + 1)), (0xf0, _) => Ok(Operation::Syscall(opcode, 3, false)), // CREATE (0xf1, _) => Ok(Operation::Syscall(opcode, 7, false)), // CALL (0xf2, _) => Ok(Operation::Syscall(opcode, 7, false)), // CALLCODE @@ -162,11 +163,9 @@ fn decode(registers: RegistersState, opcode: u8) -> Result(op: Operation, row: &mut CpuColumnsView) { let flags = &mut row.op; *match op { - Operation::Push(0) => &mut flags.push0, - Operation::Push(1..) 
=> &mut flags.push, Operation::Dup(_) | Operation::Swap(_) => &mut flags.dup_swap, Operation::Iszero | Operation::Eq => &mut flags.eq_iszero, - Operation::Not => &mut flags.not, + Operation::Not | Operation::Pop => &mut flags.not_pop, Operation::Syscall(_, _, _) => &mut flags.syscall, Operation::BinaryLogic(_) => &mut flags.logic_op, Operation::BinaryArithmetic(arithmetic::BinaryOperator::AddFp254) @@ -176,29 +175,25 @@ fn fill_op_flag(op: Operation, row: &mut CpuColumnsView) { | Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr) => &mut flags.shift, Operation::BinaryArithmetic(_) => &mut flags.binary_op, Operation::TernaryArithmetic(_) => &mut flags.ternary_op, - Operation::KeccakGeneral => &mut flags.keccak_general, - Operation::ProverInput => &mut flags.prover_input, - Operation::Pop => &mut flags.pop, + Operation::KeccakGeneral | Operation::Jumpdest => &mut flags.jumpdest_keccak_general, + Operation::ProverInput | Operation::Push(1..) => &mut flags.push_prover_input, Operation::Jump | Operation::Jumpi => &mut flags.jumps, - Operation::Pc => &mut flags.pc, - Operation::Jumpdest => &mut flags.jumpdest, - Operation::GetContext => &mut flags.get_context, - Operation::SetContext => &mut flags.set_context, - Operation::Mload32Bytes => &mut flags.mload_32bytes, - Operation::Mstore32Bytes => &mut flags.mstore_32bytes, + Operation::Pc | Operation::Push(0) => &mut flags.pc_push0, + Operation::GetContext | Operation::SetContext => &mut flags.context_op, + Operation::Mload32Bytes | Operation::Mstore32Bytes(_) => &mut flags.m_op_32bytes, Operation::ExitKernel => &mut flags.exit_kernel, Operation::MloadGeneral | Operation::MstoreGeneral => &mut flags.m_op_general, } = F::ONE; } // Equal to the number of pops if an operation pops without pushing, and `None` otherwise. 
-fn get_op_special_length(op: Operation) -> Option { +const fn get_op_special_length(op: Operation) -> Option { let behavior_opt = match op { - Operation::Push(0) => STACK_BEHAVIORS.push0, - Operation::Push(1..) => STACK_BEHAVIORS.push, + Operation::Push(0) | Operation::Pc => STACK_BEHAVIORS.pc_push0, + Operation::Push(1..) | Operation::ProverInput => STACK_BEHAVIORS.push_prover_input, Operation::Dup(_) | Operation::Swap(_) => STACK_BEHAVIORS.dup_swap, Operation::Iszero => IS_ZERO_STACK_BEHAVIOR, - Operation::Not => STACK_BEHAVIORS.not, + Operation::Not | Operation::Pop => STACK_BEHAVIORS.not_pop, Operation::Syscall(_, _, _) => STACK_BEHAVIORS.syscall, Operation::Eq => EQ_STACK_BEHAVIOR, Operation::BinaryLogic(_) => STACK_BEHAVIORS.logic_op, @@ -211,17 +206,11 @@ fn get_op_special_length(op: Operation) -> Option { | Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr) => STACK_BEHAVIORS.shift, Operation::BinaryArithmetic(_) => STACK_BEHAVIORS.binary_op, Operation::TernaryArithmetic(_) => STACK_BEHAVIORS.ternary_op, - Operation::KeccakGeneral => STACK_BEHAVIORS.keccak_general, - Operation::ProverInput => STACK_BEHAVIORS.prover_input, - Operation::Pop => STACK_BEHAVIORS.pop, + Operation::KeccakGeneral | Operation::Jumpdest => STACK_BEHAVIORS.jumpdest_keccak_general, Operation::Jump => JUMP_OP, Operation::Jumpi => JUMPI_OP, - Operation::Pc => STACK_BEHAVIORS.pc, - Operation::Jumpdest => STACK_BEHAVIORS.jumpdest, - Operation::GetContext => STACK_BEHAVIORS.get_context, - Operation::SetContext => None, - Operation::Mload32Bytes => STACK_BEHAVIORS.mload_32bytes, - Operation::Mstore32Bytes => STACK_BEHAVIORS.mstore_32bytes, + Operation::GetContext | Operation::SetContext => None, + Operation::Mload32Bytes | Operation::Mstore32Bytes(_) => STACK_BEHAVIORS.m_op_32bytes, Operation::ExitKernel => STACK_BEHAVIORS.exit_kernel, Operation::MloadGeneral | Operation::MstoreGeneral => STACK_BEHAVIORS.m_op_general, }; @@ -236,11 +225,40 @@ fn get_op_special_length(op: 
Operation) -> Option { } } +// These operations might trigger a stack overflow, typically those pushing without popping. +// Kernel-only pushing instructions aren't considered; they can't overflow. +const fn might_overflow_op(op: Operation) -> bool { + match op { + Operation::Push(1..) | Operation::ProverInput => MIGHT_OVERFLOW.push_prover_input, + Operation::Dup(_) | Operation::Swap(_) => MIGHT_OVERFLOW.dup_swap, + Operation::Iszero | Operation::Eq => MIGHT_OVERFLOW.eq_iszero, + Operation::Not | Operation::Pop => MIGHT_OVERFLOW.not_pop, + Operation::Syscall(_, _, _) => MIGHT_OVERFLOW.syscall, + Operation::BinaryLogic(_) => MIGHT_OVERFLOW.logic_op, + Operation::BinaryArithmetic(arithmetic::BinaryOperator::AddFp254) + | Operation::BinaryArithmetic(arithmetic::BinaryOperator::MulFp254) + | Operation::BinaryArithmetic(arithmetic::BinaryOperator::SubFp254) => { + MIGHT_OVERFLOW.fp254_op + } + Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shl) + | Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr) => MIGHT_OVERFLOW.shift, + Operation::BinaryArithmetic(_) => MIGHT_OVERFLOW.binary_op, + Operation::TernaryArithmetic(_) => MIGHT_OVERFLOW.ternary_op, + Operation::KeccakGeneral | Operation::Jumpdest => MIGHT_OVERFLOW.jumpdest_keccak_general, + Operation::Jump | Operation::Jumpi => MIGHT_OVERFLOW.jumps, + Operation::Pc | Operation::Push(0) => MIGHT_OVERFLOW.pc_push0, + Operation::GetContext | Operation::SetContext => MIGHT_OVERFLOW.context_op, + Operation::Mload32Bytes | Operation::Mstore32Bytes(_) => MIGHT_OVERFLOW.m_op_32bytes, + Operation::ExitKernel => MIGHT_OVERFLOW.exit_kernel, + Operation::MloadGeneral | Operation::MstoreGeneral => MIGHT_OVERFLOW.m_op_general, + } +} + fn perform_op( state: &mut GenerationState, op: Operation, row: CpuColumnsView, -) -> Result<(), ProgramError> { +) -> Result { match op { Operation::Push(n) => generate_push(n, state, row)?, Operation::Dup(n) => generate_dup(n, state, row)?, @@ -268,7 +286,7 @@ fn perform_op( 
Operation::GetContext => generate_get_context(state, row)?, Operation::SetContext => generate_set_context(state, row)?, Operation::Mload32Bytes => generate_mload_32bytes(state, row)?, - Operation::Mstore32Bytes => generate_mstore_32bytes(state, row)?, + Operation::Mstore32Bytes(n) => generate_mstore_32bytes(n, state, row)?, Operation::ExitKernel => generate_exit_kernel(state, row)?, Operation::MloadGeneral => generate_mload_general(state, row)?, Operation::MstoreGeneral => generate_mstore_general(state, row)?, @@ -283,7 +301,24 @@ fn perform_op( state.registers.gas_used += gas_to_charge(op); - Ok(()) + let gas_limit_address = MemoryAddress::new( + state.registers.context, + Segment::ContextMetadata, + ContextMetadata::GasLimit.unscale(), // context offsets are already scaled + ); + if !state.registers.is_kernel { + let gas_limit = TryInto::::try_into(state.memory.get(gas_limit_address)); + match gas_limit { + Ok(limit) => { + if state.registers.gas_used > limit { + return Err(ProgramError::OutOfGas); + } + } + Err(_) => return Err(ProgramError::IntegerTooLarge), + } + } + + Ok(op) } /// Row that has the correct values for system registers and the code channel, but is otherwise @@ -295,10 +330,7 @@ fn base_row(state: &mut GenerationState) -> (CpuColumnsView, u8) row.context = F::from_canonical_usize(state.registers.context); row.program_counter = F::from_canonical_usize(state.registers.program_counter); row.is_kernel_mode = F::from_bool(state.registers.is_kernel); - row.gas = [ - F::from_canonical_u32(state.registers.gas_used as u32), - F::from_canonical_u32((state.registers.gas_used >> 32) as u32), - ]; + row.gas = F::from_canonical_u64(state.registers.gas_used); row.stack_len = F::from_canonical_usize(state.registers.stack_len); fill_channel_with_value(&mut row, 0, state.registers.stack_top); @@ -306,31 +338,23 @@ fn base_row(state: &mut GenerationState) -> (CpuColumnsView, u8) (row, opcode) } -fn try_perform_instruction(state: &mut GenerationState) -> Result<(), 
ProgramError> { - let (mut row, opcode) = base_row(state); - let op = decode(state.registers, opcode)?; - - if state.registers.is_kernel { - log_kernel_instruction(state, op); - } else { - log::debug!("User instruction: {:?}", op); - } - - fill_op_flag(op, &mut row); - +pub(crate) fn fill_stack_fields( + state: &mut GenerationState, + row: &mut CpuColumnsView, +) -> Result<(), ProgramError> { if state.registers.is_stack_top_read { let channel = &mut row.mem_channels[0]; channel.used = F::ONE; channel.is_read = F::ONE; channel.addr_context = F::from_canonical_usize(state.registers.context); - channel.addr_segment = F::from_canonical_usize(Segment::Stack as usize); + channel.addr_segment = F::from_canonical_usize(Segment::Stack.unscale()); channel.addr_virtual = F::from_canonical_usize(state.registers.stack_len - 1); - let address = MemoryAddress { - context: state.registers.context, - segment: Segment::Stack as usize, - virt: state.registers.stack_len - 1, - }; + let address = MemoryAddress::new( + state.registers.context, + Segment::Stack, + state.registers.stack_len - 1, + ); let mem_op = MemoryOp::new( GeneralPurpose(0), @@ -343,19 +367,43 @@ fn try_perform_instruction(state: &mut GenerationState) -> Result<( state.registers.is_stack_top_read = false; } - if state.registers.is_kernel { - row.stack_len_bounds_aux = F::ZERO; - } else { - let disallowed_len = F::from_canonical_usize(MAX_USER_STACK_SIZE + 1); - let diff = row.stack_len - disallowed_len; - if let Some(inv) = diff.try_inverse() { - row.stack_len_bounds_aux = inv; + if state.registers.check_overflow { + if state.registers.is_kernel { + row.general.stack_mut().stack_len_bounds_aux = F::ZERO; } else { - // This is a stack overflow that should have been caught earlier. 
- return Err(ProgramError::InterpreterError); + let clock = state.traces.clock(); + let last_row = &mut state.traces.cpu[clock - 1]; + let disallowed_len = F::from_canonical_usize(MAX_USER_STACK_SIZE + 1); + let diff = row.stack_len - disallowed_len; + if let Some(inv) = diff.try_inverse() { + last_row.general.stack_mut().stack_len_bounds_aux = inv; + } else { + // This is a stack overflow that should have been caught earlier. + return Err(ProgramError::InterpreterError); + } } + state.registers.check_overflow = false; + } + + Ok(()) +} + +fn try_perform_instruction( + state: &mut GenerationState, +) -> Result { + let (mut row, opcode) = base_row(state); + let op = decode(state.registers, opcode)?; + + if state.registers.is_kernel { + log_kernel_instruction(state, op); + } else { + log::debug!("User instruction: {:?}", op); } + fill_op_flag(op, &mut row); + + fill_stack_fields(state, &mut row)?; + // Might write in general CPU columns when it shouldn't, but the correct values will // overwrite these ones during the op generation. 
if let Some(special_len) = get_op_special_length(op) { @@ -431,10 +479,13 @@ pub(crate) fn transition(state: &mut GenerationState) -> anyhow::Re let result = try_perform_instruction(state); match result { - Ok(()) => { + Ok(op) => { state .memory .apply_ops(state.traces.mem_ops_since(checkpoint.traces)); + if might_overflow_op(op) { + state.registers.check_overflow = true; + } Ok(()) } Err(e) => { @@ -445,7 +496,7 @@ pub(crate) fn transition(state: &mut GenerationState) -> anyhow::Re e, offset_name, state.stack(), - state.memory.contexts[0].segments[Segment::KernelGeneral as usize].content, + state.memory.contexts[0].segments[Segment::KernelGeneral.unscale()].content, ); } state.rollback(checkpoint); diff --git a/evm/src/witness/util.rs b/evm/src/witness/util.rs index 249703614b..5f39809392 100644 --- a/evm/src/witness/util.rs +++ b/evm/src/witness/util.rs @@ -5,8 +5,8 @@ use super::memory::DUMMY_MEMOP; use crate::byte_packing::byte_packing_stark::BytePackingOp; use crate::cpu::columns::CpuColumnsView; use crate::cpu::kernel::keccak_util::keccakf_u8s; -use crate::cpu::membus::{NUM_CHANNELS, NUM_GP_CHANNELS}; -use crate::cpu::stack_bounds::MAX_USER_STACK_SIZE; +use crate::cpu::membus::NUM_CHANNELS; +use crate::cpu::stack::MAX_USER_STACK_SIZE; use crate::generation::state::GenerationState; use crate::keccak_sponge::columns::{KECCAK_RATE_BYTES, KECCAK_WIDTH_BYTES}; use crate::keccak_sponge::keccak_sponge_stark::KeccakSpongeOp; @@ -68,31 +68,9 @@ pub(crate) fn fill_channel_with_value(row: &mut CpuColumnsView, n: } /// Pushes without writing in memory. This happens in opcodes where a push immediately follows a pop. -/// The pushed value may be loaded in a memory channel, without creating a memory operation. 
-pub(crate) fn push_no_write( - state: &mut GenerationState, - row: &mut CpuColumnsView, - val: U256, - channel_opt: Option, -) { +pub(crate) fn push_no_write(state: &mut GenerationState, val: U256) { state.registers.stack_top = val; state.registers.stack_len += 1; - - if let Some(channel) = channel_opt { - let val_limbs: [u64; 4] = val.0; - - let channel = &mut row.mem_channels[channel]; - assert_eq!(channel.used, F::ZERO); - channel.used = F::ZERO; - channel.is_read = F::ZERO; - channel.addr_context = F::from_canonical_usize(0); - channel.addr_segment = F::from_canonical_usize(0); - channel.addr_virtual = F::from_canonical_usize(0); - for (i, limb) in val_limbs.into_iter().enumerate() { - channel.value[2 * i] = F::from_canonical_u32(limb as u32); - channel.value[2 * i + 1] = F::from_canonical_u32((limb >> 32) as u32); - } - } } /// Pushes and (maybe) writes the previous stack top in memory. This happens in opcodes which only push. @@ -113,18 +91,13 @@ pub(crate) fn push_with_write( Segment::Stack, state.registers.stack_len - 1, ); - let res = mem_write_gp_log_and_fill( - NUM_GP_CHANNELS - 1, - address, - state, - row, - state.registers.stack_top, - ); + let res = mem_write_partial_log_and_fill(address, state, row, state.registers.stack_top); Some(res) }; - push_no_write(state, row, val, None); + push_no_write(state, val); if let Some(log) = write { state.traces.push_memory(log); + row.partial_channel.used = F::ONE; } Ok(()) } @@ -222,6 +195,25 @@ pub(crate) fn mem_write_gp_log_and_fill( op } +pub(crate) fn mem_write_partial_log_and_fill( + address: MemoryAddress, + state: &GenerationState, + row: &mut CpuColumnsView, + val: U256, +) -> MemoryOp { + let op = mem_write_log(MemoryChannel::PartialChannel, address, state, val); + + let channel = &mut row.partial_channel; + assert!(channel.used.is_zero()); + channel.used = F::ONE; + channel.is_read = F::ZERO; + channel.addr_context = F::from_canonical_usize(address.context); + channel.addr_segment = 
F::from_canonical_usize(address.segment); + channel.addr_virtual = F::from_canonical_usize(address.virt); + + op +} + // Channel 0 already contains the top of the stack. You only need to read // from the second popped element. // If the resulting stack isn't empty, update `stack_top`. diff --git a/evm/tests/add11_yml.rs b/evm/tests/add11_yml.rs index 91db589358..d68c531e2b 100644 --- a/evm/tests/add11_yml.rs +++ b/evm/tests/add11_yml.rs @@ -152,26 +152,24 @@ fn add11_yml() -> anyhow::Result<()> { receipts_root: receipts_trie.hash(), }; let inputs = GenerationInputs { - signed_txns: vec![txn.to_vec()], + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], tries: tries_before, trie_roots_after, contract_code, block_metadata, - genesis_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), txn_number_before: 0.into(), gas_used_before: 0.into(), gas_used_after: 0xa868u64.into(), - block_bloom_before: [0.into(); 8], - block_bloom_after: [0.into(); 8], block_hashes: BlockHashes { prev_hashes: vec![H256::default(); 256], cur_hash: H256::default(), }, - addresses: vec![], }; let mut timing = TimingTree::new("prove", log::Level::Debug); - let proof = prove::(&all_stark, &config, inputs, &mut timing)?; + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; timing.filter(Duration::from_millis(100)).print(); verify_proof(&all_stark, proof, &config) diff --git a/evm/tests/basic_smart_contract.rs b/evm/tests/basic_smart_contract.rs index dcfd2b1bf9..c8295b3757 100644 --- a/evm/tests/basic_smart_contract.rs +++ b/evm/tests/basic_smart_contract.rs @@ -184,26 +184,24 @@ fn test_basic_smart_contract() -> anyhow::Result<()> { receipts_root: receipts_trie.hash(), }; let inputs = GenerationInputs { - signed_txns: vec![txn.to_vec()], + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], tries: tries_before, trie_roots_after, contract_code, - genesis_state_trie_root: 
HashedPartialTrie::from(Node::Empty).hash(), + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), block_metadata, txn_number_before: 0.into(), gas_used_before: 0.into(), gas_used_after: gas_used.into(), - block_bloom_before: [0.into(); 8], - block_bloom_after: [0.into(); 8], block_hashes: BlockHashes { prev_hashes: vec![H256::default(); 256], cur_hash: H256::default(), }, - addresses: vec![], }; let mut timing = TimingTree::new("prove", log::Level::Debug); - let proof = prove::(&all_stark, &config, inputs, &mut timing)?; + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; timing.filter(Duration::from_millis(100)).print(); verify_proof(&all_stark, proof, &config) diff --git a/evm/tests/empty_txn_list.rs b/evm/tests/empty_txn_list.rs index dd4e624b04..15416c8c8d 100644 --- a/evm/tests/empty_txn_list.rs +++ b/evm/tests/empty_txn_list.rs @@ -1,10 +1,10 @@ +use core::marker::PhantomData; use std::collections::HashMap; -use std::marker::PhantomData; use std::time::Duration; use env_logger::{try_init_from_env, Env, DEFAULT_FILTER_ENV}; use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; -use ethereum_types::H256; +use ethereum_types::{BigEndianHash, H256}; use keccak_hash::keccak; use log::info; use plonky2::field::goldilocks_field::GoldilocksField; @@ -15,7 +15,7 @@ use plonky2_evm::all_stark::AllStark; use plonky2_evm::config::StarkConfig; use plonky2_evm::fixed_recursive_verifier::AllRecursiveCircuits; use plonky2_evm::generation::{GenerationInputs, TrieInputs}; -use plonky2_evm::proof::{BlockHashes, BlockMetadata, TrieRoots}; +use plonky2_evm::proof::{BlockHashes, BlockMetadata, PublicValues, TrieRoots}; use plonky2_evm::Node; type F = GoldilocksField; @@ -31,7 +31,10 @@ fn test_empty_txn_list() -> anyhow::Result<()> { let all_stark = AllStark::::default(); let config = StarkConfig::standard_fast_config(); - let block_metadata = BlockMetadata::default(); + let block_metadata = BlockMetadata { + block_number: 
1.into(), + ..Default::default() + }; let state_trie = HashedPartialTrie::from(Node::Empty); let transactions_trie = HashedPartialTrie::from(Node::Empty); @@ -47,8 +50,11 @@ fn test_empty_txn_list() -> anyhow::Result<()> { transactions_root: transactions_trie.hash(), receipts_root: receipts_trie.hash(), }; + let mut initial_block_hashes = vec![H256::default(); 256]; + initial_block_hashes[255] = H256::from_uint(&0x200.into()); let inputs = GenerationInputs { - signed_txns: vec![], + signed_txn: None, + withdrawals: vec![], tries: TrieInputs { state_trie, transactions_trie, @@ -57,35 +63,33 @@ fn test_empty_txn_list() -> anyhow::Result<()> { }, trie_roots_after, contract_code, - genesis_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), block_metadata, txn_number_before: 0.into(), gas_used_before: 0.into(), gas_used_after: 0.into(), - block_bloom_before: [0.into(); 8], - block_bloom_after: [0.into(); 8], block_hashes: BlockHashes { - prev_hashes: vec![H256::default(); 256], + prev_hashes: initial_block_hashes, cur_hash: H256::default(), }, - addresses: vec![], }; + // Initialize the preprocessed circuits for the zkEVM. 
let all_circuits = AllRecursiveCircuits::::new( &all_stark, - &[16..17, 10..11, 15..16, 14..15, 9..10, 12..13, 18..19], // Minimal ranges to prove an empty list + &[16..17, 9..11, 12..13, 14..15, 9..11, 12..13, 17..18], // Minimal ranges to prove an empty list &config, ); { let gate_serializer = DefaultGateSerializer; - let generator_serializer = DefaultGeneratorSerializer { + let generator_serializer = DefaultGeneratorSerializer:: { _phantom: PhantomData::, }; let timing = TimingTree::new("serialize AllRecursiveCircuits", log::Level::Info); let all_circuits_bytes = all_circuits - .to_bytes(&gate_serializer, &generator_serializer) + .to_bytes(false, &gate_serializer, &generator_serializer) .map_err(|_| anyhow::Error::msg("AllRecursiveCircuits serialization failed."))?; timing.filter(Duration::from_millis(100)).print(); info!( @@ -96,6 +100,7 @@ fn test_empty_txn_list() -> anyhow::Result<()> { let timing = TimingTree::new("deserialize AllRecursiveCircuits", log::Level::Info); let all_circuits_from_bytes = AllRecursiveCircuits::::from_bytes( &all_circuits_bytes, + false, &gate_serializer, &generator_serializer, ) @@ -107,17 +112,40 @@ fn test_empty_txn_list() -> anyhow::Result<()> { let mut timing = TimingTree::new("prove", log::Level::Info); let (root_proof, public_values) = - all_circuits.prove_root(&all_stark, &config, inputs, &mut timing)?; + all_circuits.prove_root(&all_stark, &config, inputs, &mut timing, None)?; timing.filter(Duration::from_millis(100)).print(); all_circuits.verify_root(root_proof.clone())?; + // Test retrieved public values from the proof public inputs. + let retrieved_public_values = PublicValues::from_public_inputs(&root_proof.public_inputs); + assert_eq!(retrieved_public_values, public_values); + // We can duplicate the proofs here because the state hasn't mutated. 
- let (agg_proof, public_values) = - all_circuits.prove_aggregation(false, &root_proof, false, &root_proof, public_values)?; + let (agg_proof, agg_public_values) = all_circuits.prove_aggregation( + false, + &root_proof, + public_values.clone(), + false, + &root_proof, + public_values, + )?; all_circuits.verify_aggregation(&agg_proof)?; - let (block_proof, _) = all_circuits.prove_block(None, &agg_proof, public_values)?; - all_circuits.verify_block(&block_proof) + // Test retrieved public values from the proof public inputs. + let retrieved_public_values = PublicValues::from_public_inputs(&agg_proof.public_inputs); + assert_eq!(retrieved_public_values, agg_public_values); + + let (block_proof, block_public_values) = + all_circuits.prove_block(None, &agg_proof, agg_public_values)?; + all_circuits.verify_block(&block_proof)?; + + // Test retrieved public values from the proof public inputs. + let retrieved_public_values = PublicValues::from_public_inputs(&block_proof.public_inputs); + assert_eq!(retrieved_public_values, block_public_values); + + // Get the verifier associated to these preprocessed circuits, and have it verify the block_proof. 
+ let verifier = all_circuits.final_verifier_data(); + verifier.verify(block_proof) } fn init_logger() { diff --git a/evm/tests/erc20.rs b/evm/tests/erc20.rs new file mode 100644 index 0000000000..7ec40f0606 --- /dev/null +++ b/evm/tests/erc20.rs @@ -0,0 +1,288 @@ +use std::str::FromStr; +use std::time::Duration; + +use env_logger::{try_init_from_env, Env, DEFAULT_FILTER_ENV}; +use eth_trie_utils::nibbles::Nibbles; +use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; +use ethereum_types::{Address, BigEndianHash, H160, H256, U256}; +use hex_literal::hex; +use keccak_hash::keccak; +use plonky2::field::goldilocks_field::GoldilocksField; +use plonky2::plonk::config::KeccakGoldilocksConfig; +use plonky2::util::timing::TimingTree; +use plonky2_evm::all_stark::AllStark; +use plonky2_evm::config::StarkConfig; +use plonky2_evm::generation::mpt::{AccountRlp, LegacyReceiptRlp, LogRlp}; +use plonky2_evm::generation::{GenerationInputs, TrieInputs}; +use plonky2_evm::proof::{BlockHashes, BlockMetadata, TrieRoots}; +use plonky2_evm::prover::prove; +use plonky2_evm::verifier::verify_proof; +use plonky2_evm::Node; + +type F = GoldilocksField; +const D: usize = 2; +type C = KeccakGoldilocksConfig; + +/// Test a simple ERC20 transfer. 
+/// Used the following Solidity code: +/// ```solidity +/// pragma solidity ^0.8.13; +/// import "../lib/openzeppelin-contracts/contracts/token/ERC20/ERC20.sol"; +/// contract Token is ERC20 { +/// constructor() ERC20("Token", "TKN") { +/// _mint(msg.sender, 1_000_000 ether); +/// } +/// } +/// contract Giver { +/// Token public token; +/// constructor(address _token) { +/// token = Token(_token); +/// } +/// function send(uint256 amount) public { +/// token.transfer(0x1f9090aaE28b8a3dCeaDf281B0F12828e676c326, amount); +/// } +/// } +/// ``` +#[test] +fn test_erc20() -> anyhow::Result<()> { + init_logger(); + + let all_stark = AllStark::::default(); + let config = StarkConfig::standard_fast_config(); + + let beneficiary = hex!("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); + let sender = hex!("70997970C51812dc3A010C7d01b50e0d17dc79C8"); + let giver = hex!("e7f1725E7734CE288F8367e1Bb143E90bb3F0512"); + let token = hex!("5FbDB2315678afecb367f032d93F642f64180aa3"); + + let sender_state_key = keccak(sender); + let giver_state_key = keccak(giver); + let token_state_key = keccak(token); + + let sender_nibbles = Nibbles::from_bytes_be(sender_state_key.as_bytes()).unwrap(); + let giver_nibbles = Nibbles::from_bytes_be(giver_state_key.as_bytes()).unwrap(); + let token_nibbles = Nibbles::from_bytes_be(token_state_key.as_bytes()).unwrap(); + + let mut state_trie_before = HashedPartialTrie::from(Node::Empty); + state_trie_before.insert(sender_nibbles, rlp::encode(&sender_account()).to_vec()); + state_trie_before.insert(giver_nibbles, rlp::encode(&giver_account()).to_vec()); + state_trie_before.insert(token_nibbles, rlp::encode(&token_account()).to_vec()); + + let storage_tries = vec![ + (giver_state_key, giver_storage()), + (token_state_key, token_storage()), + ]; + + let tries_before = TrieInputs { + state_trie: state_trie_before, + transactions_trie: HashedPartialTrie::from(Node::Empty), + receipts_trie: HashedPartialTrie::from(Node::Empty), + storage_tries, + }; + + let txn 
= signed_tx(); + + let gas_used = 56_499.into(); + let bloom = bloom(); + let block_metadata = BlockMetadata { + block_beneficiary: Address::from(beneficiary), + block_timestamp: 0x03e8.into(), + block_number: 1.into(), + block_difficulty: 0x020000.into(), + block_random: H256::from_uint(&0x020000.into()), + block_gaslimit: 0xff112233u32.into(), + block_chain_id: 1.into(), + block_base_fee: 0xa.into(), + block_gas_used: gas_used, + block_blob_base_fee: 0x2.into(), + block_bloom: bloom, + }; + + let contract_code = [giver_bytecode(), token_bytecode(), vec![]] + .map(|v| (keccak(v.clone()), v)) + .into(); + + let expected_state_trie_after: HashedPartialTrie = { + let mut state_trie_after = HashedPartialTrie::from(Node::Empty); + let sender_account = sender_account(); + let sender_account_after = AccountRlp { + nonce: sender_account.nonce + 1, + balance: sender_account.balance - gas_used * 0xa, + ..sender_account + }; + state_trie_after.insert(sender_nibbles, rlp::encode(&sender_account_after).to_vec()); + state_trie_after.insert(giver_nibbles, rlp::encode(&giver_account()).to_vec()); + let token_account_after = AccountRlp { + storage_root: token_storage_after().hash(), + ..token_account() + }; + state_trie_after.insert(token_nibbles, rlp::encode(&token_account_after).to_vec()); + + state_trie_after + }; + + let receipt_0 = LegacyReceiptRlp { + status: true, + cum_gas_used: gas_used, + bloom: bloom_bytes().to_vec().into(), + logs: vec![LogRlp { + address: H160::from_str("0x5fbdb2315678afecb367f032d93f642f64180aa3").unwrap(), + topics: vec![ + H256::from_str( + "0xddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef", + ) + .unwrap(), + H256::from_str( + "0x000000000000000000000000e7f1725e7734ce288f8367e1bb143e90bb3f0512", + ) + .unwrap(), + H256::from_str( + "0x0000000000000000000000001f9090aae28b8a3dceadf281b0f12828e676c326", + ) + .unwrap(), + ], + data: hex!("0000000000000000000000000000000000000000000000056bc75e2d63100000") + .to_vec() + .into(), + }], 
+ }; + let mut receipts_trie = HashedPartialTrie::from(Node::Empty); + receipts_trie.insert(Nibbles::from_str("0x80").unwrap(), receipt_0.encode(2)); + let transactions_trie: HashedPartialTrie = Node::Leaf { + nibbles: Nibbles::from_str("0x80").unwrap(), + value: txn.to_vec(), + } + .into(); + + let trie_roots_after = TrieRoots { + state_root: expected_state_trie_after.hash(), + transactions_root: transactions_trie.hash(), + receipts_root: receipts_trie.hash(), + }; + let inputs = GenerationInputs { + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], + tries: tries_before, + trie_roots_after, + contract_code, + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + block_metadata, + txn_number_before: 0.into(), + gas_used_before: 0.into(), + gas_used_after: gas_used, + block_hashes: BlockHashes { + prev_hashes: vec![H256::default(); 256], + cur_hash: H256::default(), + }, + }; + + let mut timing = TimingTree::new("prove", log::Level::Debug); + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; + timing.filter(Duration::from_millis(100)).print(); + + verify_proof(&all_stark, proof, &config) +} + +fn init_logger() { + let _ = try_init_from_env(Env::default().filter_or(DEFAULT_FILTER_ENV, "info")); +} + +fn giver_bytecode() -> Vec { + 
hex!("608060405234801561001057600080fd5b50600436106100365760003560e01c8063a52c101e1461003b578063fc0c546a14610050575b600080fd5b61004e61004936600461010c565b61007f565b005b600054610063906001600160a01b031681565b6040516001600160a01b03909116815260200160405180910390f35b60005460405163a9059cbb60e01b8152731f9090aae28b8a3dceadf281b0f12828e676c3266004820152602481018390526001600160a01b039091169063a9059cbb906044016020604051808303816000875af11580156100e4573d6000803e3d6000fd5b505050506040513d601f19601f820116820180604052508101906101089190610125565b5050565b60006020828403121561011e57600080fd5b5035919050565b60006020828403121561013757600080fd5b8151801515811461014757600080fd5b939250505056fea264697066735822122050741efdbac11eb0bbb776ce3ac6004e596b7d7559658a12506164388c371cfd64736f6c63430008140033").into() +} + +fn token_bytecode() -> Vec { + hex!("608060405234801561001057600080fd5b50600436106100935760003560e01c8063313ce56711610066578063313ce567146100fe57806370a082311461010d57806395d89b4114610136578063a9059cbb1461013e578063dd62ed3e1461015157600080fd5b806306fdde0314610098578063095ea7b3146100b657806318160ddd146100d957806323b872dd146100eb575b600080fd5b6100a061018a565b6040516100ad919061056a565b60405180910390f35b6100c96100c43660046105d4565b61021c565b60405190151581526020016100ad565b6002545b6040519081526020016100ad565b6100c96100f93660046105fe565b610236565b604051601281526020016100ad565b6100dd61011b36600461063a565b6001600160a01b031660009081526020819052604090205490565b6100a061025a565b6100c961014c3660046105d4565b610269565b6100dd61015f36600461065c565b6001600160a01b03918216600090815260016020908152604080832093909416825291909152205490565b6060600380546101999061068f565b80601f01602080910402602001604051908101604052809291908181526020018280546101c59061068f565b80156102125780601f106101e757610100808354040283529160200191610212565b820191906000526020600020905b8154815290600101906020018083116101f557829003601f168201915b5050505050905090565b60003361022a818585610277565b60019150505b92915050565b600033610244858285610289565b610
24f85858561030c565b506001949350505050565b6060600480546101999061068f565b60003361022a81858561030c565b610284838383600161036b565b505050565b6001600160a01b03838116600090815260016020908152604080832093861683529290522054600019811461030657818110156102f757604051637dc7a0d960e11b81526001600160a01b038416600482015260248101829052604481018390526064015b60405180910390fd5b6103068484848403600061036b565b50505050565b6001600160a01b03831661033657604051634b637e8f60e11b8152600060048201526024016102ee565b6001600160a01b0382166103605760405163ec442f0560e01b8152600060048201526024016102ee565b610284838383610440565b6001600160a01b0384166103955760405163e602df0560e01b8152600060048201526024016102ee565b6001600160a01b0383166103bf57604051634a1406b160e11b8152600060048201526024016102ee565b6001600160a01b038085166000908152600160209081526040808320938716835292905220829055801561030657826001600160a01b0316846001600160a01b03167f8c5be1e5ebec7d5bd14f71427d1e84f3dd0314c0f7b2291e5b200ac8c7c3b9258460405161043291815260200190565b60405180910390a350505050565b6001600160a01b03831661046b57806002600082825461046091906106c9565b909155506104dd9050565b6001600160a01b038316600090815260208190526040902054818110156104be5760405163391434e360e21b81526001600160a01b038516600482015260248101829052604481018390526064016102ee565b6001600160a01b03841660009081526020819052604090209082900390555b6001600160a01b0382166104f957600280548290039055610518565b6001600160a01b03821660009081526020819052604090208054820190555b816001600160a01b0316836001600160a01b03167fddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef8360405161055d91815260200190565b60405180910390a3505050565b600060208083528351808285015260005b818110156105975785810183015185820160400152820161057b565b506000604082860101526040601f19601f8301168501019250505092915050565b80356001600160a01b03811681146105cf57600080fd5b919050565b600080604083850312156105e757600080fd5b6105f0836105b8565b946020939093013593505050565b60008060006060848603121561061357600080fd5b61061c846105b8565b925061062a602085016105b8565b9150604
084013590509250925092565b60006020828403121561064c57600080fd5b610655826105b8565b9392505050565b6000806040838503121561066f57600080fd5b610678836105b8565b9150610686602084016105b8565b90509250929050565b600181811c908216806106a357607f821691505b6020821081036106c357634e487b7160e01b600052602260045260246000fd5b50919050565b8082018082111561023057634e487b7160e01b600052601160045260246000fdfea2646970667358221220266a323ae4a816f6c6342a5be431fedcc0d45c44b02ea75f5474eb450b5d45b364736f6c63430008140033").into() +} + +fn insert_storage(trie: &mut HashedPartialTrie, slot: U256, value: U256) { + let mut bytes = [0; 32]; + slot.to_big_endian(&mut bytes); + let key = keccak(bytes); + let nibbles = Nibbles::from_bytes_be(key.as_bytes()).unwrap(); + let r = rlp::encode(&value); + let r = r.freeze().to_vec(); + trie.insert(nibbles, r); +} + +fn sd2u(s: &str) -> U256 { + U256::from_dec_str(s).unwrap() +} + +fn giver_storage() -> HashedPartialTrie { + let mut trie = HashedPartialTrie::from(Node::Empty); + insert_storage( + &mut trie, + U256::zero(), + sd2u("546584486846459126461364135121053344201067465379"), + ); + trie +} + +fn token_storage() -> HashedPartialTrie { + let mut trie = HashedPartialTrie::from(Node::Empty); + insert_storage( + &mut trie, + sd2u("82183438603287090451672504949863617512989139203883434767553028632841710582583"), + sd2u("1000000000000000000000"), + ); + trie +} + +fn token_storage_after() -> HashedPartialTrie { + let mut trie = HashedPartialTrie::from(Node::Empty); + insert_storage( + &mut trie, + sd2u("82183438603287090451672504949863617512989139203883434767553028632841710582583"), + sd2u("900000000000000000000"), + ); + insert_storage( + &mut trie, + sd2u("53006154680716014998529145169423020330606407246856709517064848190396281160729"), + sd2u("100000000000000000000"), + ); + trie +} + +fn giver_account() -> AccountRlp { + AccountRlp { + nonce: 1.into(), + balance: 0.into(), + storage_root: giver_storage().hash(), + code_hash: keccak(giver_bytecode()), + } +} + +fn 
token_account() -> AccountRlp { + AccountRlp { + nonce: 1.into(), + balance: 0.into(), + storage_root: token_storage().hash(), + code_hash: keccak(token_bytecode()), + } +} + +fn sender_account() -> AccountRlp { + AccountRlp { + nonce: 0.into(), + balance: sd2u("10000000000000000000000"), + storage_root: Default::default(), + code_hash: keccak([]), + } +} + +fn signed_tx() -> Vec { + hex!("02f88701800a0a830142c594e7f1725e7734ce288f8367e1bb143e90bb3f051280a4a52c101e0000000000000000000000000000000000000000000000056bc75e2d63100000c001a0303f5591159d7ea303faecb1c8bd8624b55732f769de28b111190dfb9a7c5234a019d5d6d38938dc1c63acbe106cf361672def773ace4ca587860117d057326627").into() +} + +fn bloom_bytes() -> [u8; 256] { + hex!("00000000000000000400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008000000000008000000000000000000000000000000000040000000000000000000000000000000000000000000000014000000000000000000000000000000000000000000000000000000000000000200000000000000000000000000000002000000000000000000000000000000000000000000000042000000000000000000000000000000000000000000020000000000080000000000000000000000000000000000000000000000000000000000000000") +} + +fn bloom() -> [U256; 8] { + let bloom = bloom_bytes() + .chunks_exact(32) + .map(U256::from_big_endian) + .collect::>(); + bloom.try_into().unwrap() +} diff --git a/evm/tests/erc721.rs b/evm/tests/erc721.rs new file mode 100644 index 0000000000..0428204013 --- /dev/null +++ b/evm/tests/erc721.rs @@ -0,0 +1,315 @@ +use std::str::FromStr; +use std::time::Duration; + +use env_logger::{try_init_from_env, Env, DEFAULT_FILTER_ENV}; +use eth_trie_utils::nibbles::Nibbles; +use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; +use ethereum_types::{Address, BigEndianHash, H160, H256, U256}; +use hex_literal::hex; +use keccak_hash::keccak; +use plonky2::field::goldilocks_field::GoldilocksField; +use 
plonky2::plonk::config::KeccakGoldilocksConfig; +use plonky2::util::timing::TimingTree; +use plonky2_evm::all_stark::AllStark; +use plonky2_evm::config::StarkConfig; +use plonky2_evm::generation::mpt::{AccountRlp, LegacyReceiptRlp, LogRlp}; +use plonky2_evm::generation::{GenerationInputs, TrieInputs}; +use plonky2_evm::proof::{BlockHashes, BlockMetadata, TrieRoots}; +use plonky2_evm::prover::prove; +use plonky2_evm::verifier::verify_proof; +use plonky2_evm::Node; + +type F = GoldilocksField; +const D: usize = 2; +type C = KeccakGoldilocksConfig; + +/// Test a simple ERC721 token transfer. +/// Used the following Solidity code: +/// ```solidity +/// pragma solidity ^0.8.20; +/// +/// import "@openzeppelin/contracts@5.0.1/token/ERC721/ERC721.sol"; +/// import "@openzeppelin/contracts@5.0.1/access/Ownable.sol"; +/// +/// contract TestToken is ERC721, Ownable { +/// constructor(address initialOwner) +/// ERC721("TestToken", "TEST") +/// Ownable(initialOwner) +/// {} +/// +/// function safeMint(address to, uint256 tokenId) public onlyOwner { +/// _safeMint(to, tokenId); +/// } +/// } +/// ``` +/// +/// The transaction calls the `safeTransferFrom` function to transfer token `1337` from address +/// `0x5B38Da6a701c568545dCfcB03FcB875f56beddC4` to address `0xAb8483F64d9C6d1EcF9b849Ae677dD3315835cb2`. 
+#[test] +fn test_erc721() -> anyhow::Result<()> { + init_logger(); + + let all_stark = AllStark::::default(); + let config = StarkConfig::standard_fast_config(); + + let beneficiary = hex!("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); + let owner = hex!("5B38Da6a701c568545dCfcB03FcB875f56beddC4"); + let contract = hex!("f2B1114C644cBb3fF63Bf1dD284c8Cd716e95BE9"); + + let owner_state_key = keccak(owner); + let contract_state_key = keccak(contract); + + let owner_nibbles = Nibbles::from_bytes_be(owner_state_key.as_bytes()).unwrap(); + let contract_nibbles = Nibbles::from_bytes_be(contract_state_key.as_bytes()).unwrap(); + + let mut state_trie_before = HashedPartialTrie::from(Node::Empty); + state_trie_before.insert(owner_nibbles, rlp::encode(&owner_account()).to_vec()); + state_trie_before.insert(contract_nibbles, rlp::encode(&contract_account()).to_vec()); + + let storage_tries = vec![(contract_state_key, contract_storage())]; + + let tries_before = TrieInputs { + state_trie: state_trie_before, + transactions_trie: HashedPartialTrie::from(Node::Empty), + receipts_trie: HashedPartialTrie::from(Node::Empty), + storage_tries, + }; + + let txn = signed_tx(); + + let gas_used = 58_418.into(); + + let contract_code = [contract_bytecode(), vec![]] + .map(|v| (keccak(v.clone()), v)) + .into(); + + let expected_state_trie_after: HashedPartialTrie = { + let mut state_trie_after = HashedPartialTrie::from(Node::Empty); + let owner_account = owner_account(); + let owner_account_after = AccountRlp { + nonce: owner_account.nonce + 1, + balance: owner_account.balance - gas_used * 0xa, + ..owner_account + }; + state_trie_after.insert(owner_nibbles, rlp::encode(&owner_account_after).to_vec()); + let contract_account_after = AccountRlp { + storage_root: contract_storage_after().hash(), + ..contract_account() + }; + state_trie_after.insert( + contract_nibbles, + rlp::encode(&contract_account_after).to_vec(), + ); + + state_trie_after + }; + + let logs = vec![LogRlp { + address: 
H160::from_str("0xf2B1114C644cBb3fF63Bf1dD284c8Cd716e95BE9").unwrap(), + topics: vec![ + H256::from_str("0xddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef") + .unwrap(), + H256::from_str("0x0000000000000000000000005b38da6a701c568545dcfcb03fcb875f56beddc4") + .unwrap(), + H256::from_str("0x000000000000000000000000ab8483f64d9c6d1ecf9b849ae677dd3315835cb2") + .unwrap(), + H256::from_str("0x0000000000000000000000000000000000000000000000000000000000000539") + .unwrap(), + ], + data: vec![].into(), + }]; + + let mut bloom_bytes = [0u8; 256]; + add_logs_to_bloom(&mut bloom_bytes, &logs); + + let receipt_0 = LegacyReceiptRlp { + status: true, + cum_gas_used: gas_used, + bloom: bloom_bytes.to_vec().into(), + logs, + }; + let mut receipts_trie = HashedPartialTrie::from(Node::Empty); + receipts_trie.insert(Nibbles::from_str("0x80").unwrap(), receipt_0.encode(0)); + let transactions_trie: HashedPartialTrie = Node::Leaf { + nibbles: Nibbles::from_str("0x80").unwrap(), + value: txn.to_vec(), + } + .into(); + + let trie_roots_after = TrieRoots { + state_root: expected_state_trie_after.hash(), + transactions_root: transactions_trie.hash(), + receipts_root: receipts_trie.hash(), + }; + + let bloom = bloom_bytes + .chunks_exact(32) + .map(U256::from_big_endian) + .collect::>(); + + let block_metadata = BlockMetadata { + block_beneficiary: Address::from(beneficiary), + block_timestamp: 0x03e8.into(), + block_number: 1.into(), + block_difficulty: 0x020000.into(), + block_random: H256::from_uint(&0x020000.into()), + block_gaslimit: 0xff112233u32.into(), + block_chain_id: 1.into(), + block_base_fee: 0xa.into(), + block_gas_used: gas_used, + block_blob_base_fee: 0x2.into(), + block_bloom: bloom.try_into().unwrap(), + }; + + let inputs = GenerationInputs { + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], + tries: tries_before, + trie_roots_after, + contract_code, + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + block_metadata, + 
txn_number_before: 0.into(), + gas_used_before: 0.into(), + gas_used_after: gas_used, + block_hashes: BlockHashes { + prev_hashes: vec![H256::default(); 256], + cur_hash: H256::default(), + }, + }; + + let mut timing = TimingTree::new("prove", log::Level::Debug); + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; + timing.filter(Duration::from_millis(100)).print(); + + verify_proof(&all_stark, proof, &config) +} + +fn init_logger() { + let _ = try_init_from_env(Env::default().filter_or(DEFAULT_FILTER_ENV, "info")); +} + +fn contract_bytecode() -> Vec { + hex!("608060405234801561000f575f80fd5b5060043610610109575f3560e01c8063715018a6116100a0578063a22cb4651161006f578063a22cb465146102a1578063b88d4fde146102bd578063c87b56dd146102d9578063e985e9c514610309578063f2fde38b1461033957610109565b8063715018a61461023f5780638da5cb5b1461024957806395d89b4114610267578063a14481941461028557610109565b806323b872dd116100dc57806323b872dd146101a757806342842e0e146101c35780636352211e146101df57806370a082311461020f57610109565b806301ffc9a71461010d57806306fdde031461013d578063081812fc1461015b578063095ea7b31461018b575b5f80fd5b61012760048036038101906101229190611855565b610355565b604051610134919061189a565b60405180910390f35b610145610436565b604051610152919061193d565b60405180910390f35b61017560048036038101906101709190611990565b6104c5565b60405161018291906119fa565b60405180910390f35b6101a560048036038101906101a09190611a3d565b6104e0565b005b6101c160048036038101906101bc9190611a7b565b6104f6565b005b6101dd60048036038101906101d89190611a7b565b6105f5565b005b6101f960048036038101906101f49190611990565b610614565b60405161020691906119fa565b60405180910390f35b61022960048036038101906102249190611acb565b610625565b6040516102369190611b05565b60405180910390f35b6102476106db565b005b6102516106ee565b60405161025e91906119fa565b60405180910390f35b61026f610716565b60405161027c919061193d565b60405180910390f35b61029f600480360381019061029a9190611a3d565b6107a6565b005b6102bb60048036038101906102b69190611b48565b6107bc565b005b6102d760
048036038101906102d29190611cb2565b6107d2565b005b6102f360048036038101906102ee9190611990565b6107ef565b604051610300919061193d565b60405180910390f35b610323600480360381019061031e9190611d32565b610855565b604051610330919061189a565b60405180910390f35b610353600480360381019061034e9190611acb565b6108e3565b005b5f7f80ac58cd000000000000000000000000000000000000000000000000000000007bffffffffffffffffffffffffffffffffffffffffffffffffffffffff1916827bffffffffffffffffffffffffffffffffffffffffffffffffffffffff1916148061041f57507f5b5e139f000000000000000000000000000000000000000000000000000000007bffffffffffffffffffffffffffffffffffffffffffffffffffffffff1916827bffffffffffffffffffffffffffffffffffffffffffffffffffffffff1916145b8061042f575061042e82610967565b5b9050919050565b60605f805461044490611d9d565b80601f016020809104026020016040519081016040528092919081815260200182805461047090611d9d565b80156104bb5780601f10610492576101008083540402835291602001916104bb565b820191905f5260205f20905b81548152906001019060200180831161049e57829003601f168201915b5050505050905090565b5f6104cf826109d0565b506104d982610a56565b9050919050565b6104f282826104ed610a8f565b610a96565b5050565b5f73ffffffffffffffffffffffffffffffffffffffff168273ffffffffffffffffffffffffffffffffffffffff1603610566575f6040517f64a0ae9200000000000000000000000000000000000000000000000000000000815260040161055d91906119fa565b60405180910390fd5b5f6105798383610574610a8f565b610aa8565b90508373ffffffffffffffffffffffffffffffffffffffff168173ffffffffffffffffffffffffffffffffffffffff16146105ef578382826040517f64283d7b0000000000000000000000000000000000000000000000000000000081526004016105e693929190611dcd565b60405180910390fd5b50505050565b61060f83838360405180602001604052805f8152506107d2565b505050565b5f61061e826109d0565b9050919050565b5f8073ffffffffffffffffffffffffffffffffffffffff168273ffffffffffffffffffffffffffffffffffffffff1603610696575f6040517f89c62b6400000000000000000000000000000000000000000000000000000000815260040161068d91906119fa565b60405180910390fd5b60035f8373ffffffffffffffffffffffffffff
ffffffffffff1673ffffffffffffffffffffffffffffffffffffffff1681526020019081526020015f20549050919050565b6106e3610cb3565b6106ec5f610d3a565b565b5f60065f9054906101000a900473ffffffffffffffffffffffffffffffffffffffff16905090565b60606001805461072590611d9d565b80601f016020809104026020016040519081016040528092919081815260200182805461075190611d9d565b801561079c5780601f106107735761010080835404028352916020019161079c565b820191905f5260205f20905b81548152906001019060200180831161077f57829003601f168201915b5050505050905090565b6107ae610cb3565b6107b88282610dfd565b5050565b6107ce6107c7610a8f565b8383610e1a565b5050565b6107dd8484846104f6565b6107e984848484610f83565b50505050565b60606107fa826109d0565b505f610804611135565b90505f8151116108225760405180602001604052805f81525061084d565b8061082c8461114b565b60405160200161083d929190611e3c565b6040516020818303038152906040525b915050919050565b5f60055f8473ffffffffffffffffffffffffffffffffffffffff1673ffffffffffffffffffffffffffffffffffffffff1681526020019081526020015f205f8373ffffffffffffffffffffffffffffffffffffffff1673ffffffffffffffffffffffffffffffffffffffff1681526020019081526020015f205f9054906101000a900460ff16905092915050565b6108eb610cb3565b5f73ffffffffffffffffffffffffffffffffffffffff168173ffffffffffffffffffffffffffffffffffffffff160361095b575f6040517f1e4fbdf700000000000000000000000000000000000000000000000000000000815260040161095291906119fa565b60405180910390fd5b61096481610d3a565b50565b5f7f01ffc9a7000000000000000000000000000000000000000000000000000000007bffffffffffffffffffffffffffffffffffffffffffffffffffffffff1916827bffffffffffffffffffffffffffffffffffffffffffffffffffffffff1916149050919050565b5f806109db83611215565b90505f73ffffffffffffffffffffffffffffffffffffffff168173ffffffffffffffffffffffffffffffffffffffff1603610a4d57826040517f7e273289000000000000000000000000000000000000000000000000000000008152600401610a449190611b05565b60405180910390fd5b80915050919050565b5f60045f8381526020019081526020015f205f9054906101000a900473ffffffffffffffffffffffffffffffffffffffff169050919050565b5f33
905090565b610aa3838383600161124e565b505050565b5f80610ab384611215565b90505f73ffffffffffffffffffffffffffffffffffffffff168373ffffffffffffffffffffffffffffffffffffffff1614610af457610af381848661140d565b5b5f73ffffffffffffffffffffffffffffffffffffffff168173ffffffffffffffffffffffffffffffffffffffff1614610b7f57610b335f855f8061124e565b600160035f8373ffffffffffffffffffffffffffffffffffffffff1673ffffffffffffffffffffffffffffffffffffffff1681526020019081526020015f205f82825403925050819055505b5f73ffffffffffffffffffffffffffffffffffffffff168573ffffffffffffffffffffffffffffffffffffffff1614610bfe57600160035f8773ffffffffffffffffffffffffffffffffffffffff1673ffffffffffffffffffffffffffffffffffffffff1681526020019081526020015f205f82825401925050819055505b8460025f8681526020019081526020015f205f6101000a81548173ffffffffffffffffffffffffffffffffffffffff021916908373ffffffffffffffffffffffffffffffffffffffff160217905550838573ffffffffffffffffffffffffffffffffffffffff168273ffffffffffffffffffffffffffffffffffffffff167fddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef60405160405180910390a4809150509392505050565b610cbb610a8f565b73ffffffffffffffffffffffffffffffffffffffff16610cd96106ee565b73ffffffffffffffffffffffffffffffffffffffff1614610d3857610cfc610a8f565b6040517f118cdaa7000000000000000000000000000000000000000000000000000000008152600401610d2f91906119fa565b60405180910390fd5b565b5f60065f9054906101000a900473ffffffffffffffffffffffffffffffffffffffff1690508160065f6101000a81548173ffffffffffffffffffffffffffffffffffffffff021916908373ffffffffffffffffffffffffffffffffffffffff1602179055508173ffffffffffffffffffffffffffffffffffffffff168173ffffffffffffffffffffffffffffffffffffffff167f8be0079c531659141344cd1fd0a4f28419497f9722a3daafe3b4186f6b6457e060405160405180910390a35050565b610e16828260405180602001604052805f8152506114d0565b5050565b5f73ffffffffffffffffffffffffffffffffffffffff168273ffffffffffffffffffffffffffffffffffffffff1603610e8a57816040517f5b08ba18000000000000000000000000000000000000000000000000000000008152600401610e
8191906119fa565b60405180910390fd5b8060055f8573ffffffffffffffffffffffffffffffffffffffff1673ffffffffffffffffffffffffffffffffffffffff1681526020019081526020015f205f8473ffffffffffffffffffffffffffffffffffffffff1673ffffffffffffffffffffffffffffffffffffffff1681526020019081526020015f205f6101000a81548160ff0219169083151502179055508173ffffffffffffffffffffffffffffffffffffffff168373ffffffffffffffffffffffffffffffffffffffff167f17307eab39ab6107e8899845ad3d59bd9653f200f220920489ca2b5937696c3183604051610f76919061189a565b60405180910390a3505050565b5f8373ffffffffffffffffffffffffffffffffffffffff163b111561112f578273ffffffffffffffffffffffffffffffffffffffff1663150b7a02610fc6610a8f565b8685856040518563ffffffff1660e01b8152600401610fe89493929190611eb1565b6020604051808303815f875af192505050801561102357506040513d601f19601f820116820180604052508101906110209190611f0f565b60015b6110a4573d805f8114611051576040519150601f19603f3d011682016040523d82523d5f602084013e611056565b606091505b505f81510361109c57836040517f64a0ae9200000000000000000000000000000000000000000000000000000000815260040161109391906119fa565b60405180910390fd5b805181602001fd5b63150b7a0260e01b7bffffffffffffffffffffffffffffffffffffffffffffffffffffffff1916817bffffffffffffffffffffffffffffffffffffffffffffffffffffffff19161461112d57836040517f64a0ae9200000000000000000000000000000000000000000000000000000000815260040161112491906119fa565b60405180910390fd5b505b50505050565b606060405180602001604052805f815250905090565b60605f6001611159846114eb565b0190505f8167ffffffffffffffff81111561117757611176611b8e565b5b6040519080825280601f01601f1916602001820160405280156111a95781602001600182028036833780820191505090505b5090505f82602001820190505b60011561120a578080600190039150507f3031323334353637383961626364656600000000000000000000000000000000600a86061a8153600a85816111ff576111fe611f3a565b5b0494505f85036111b6575b819350505050919050565b5f60025f8381526020019081526020015f205f9054906101000a900473ffffffffffffffffffffffffffffffffffffffff169050919050565b808061128657505f73ffffffffffffffffffff
ffffffffffffffffffff168273ffffffffffffffffffffffffffffffffffffffff1614155b156113b8575f611295846109d0565b90505f73ffffffffffffffffffffffffffffffffffffffff168373ffffffffffffffffffffffffffffffffffffffff16141580156112ff57508273ffffffffffffffffffffffffffffffffffffffff168173ffffffffffffffffffffffffffffffffffffffff1614155b801561131257506113108184610855565b155b1561135457826040517fa9fbf51f00000000000000000000000000000000000000000000000000000000815260040161134b91906119fa565b60405180910390fd5b81156113b657838573ffffffffffffffffffffffffffffffffffffffff168273ffffffffffffffffffffffffffffffffffffffff167f8c5be1e5ebec7d5bd14f71427d1e84f3dd0314c0f7b2291e5b200ac8c7c3b92560405160405180910390a45b505b8360045f8581526020019081526020015f205f6101000a81548173ffffffffffffffffffffffffffffffffffffffff021916908373ffffffffffffffffffffffffffffffffffffffff16021790555050505050565b61141883838361163c565b6114cb575f73ffffffffffffffffffffffffffffffffffffffff168373ffffffffffffffffffffffffffffffffffffffff160361148c57806040517f7e2732890000000000000000000000000000000000000000000000000000000081526004016114839190611b05565b60405180910390fd5b81816040517f177e802f0000000000000000000000000000000000000000000000000000000081526004016114c2929190611f67565b60405180910390fd5b505050565b6114da83836116fc565b6114e65f848484610f83565b505050565b5f805f90507a184f03e93ff9f4daa797ed6e38ed64bf6a1f0100000000000000008310611547577a184f03e93ff9f4daa797ed6e38ed64bf6a1f010000000000000000838161153d5761153c611f3a565b5b0492506040810190505b6d04ee2d6d415b85acef81000000008310611584576d04ee2d6d415b85acef8100000000838161157a57611579611f3a565b5b0492506020810190505b662386f26fc1000083106115b357662386f26fc1000083816115a9576115a8611f3a565b5b0492506010810190505b6305f5e10083106115dc576305f5e10083816115d2576115d1611f3a565b5b0492506008810190505b61271083106116015761271083816115f7576115f6611f3a565b5b0492506004810190505b60648310611624576064838161161a57611619611f3a565b5b0492506002810190505b600a8310611633576001810190505b80915050919050565b5f8073ffffffffffffffffffff
ffffffffffffffffffff168373ffffffffffffffffffffffffffffffffffffffff16141580156116f357508273ffffffffffffffffffffffffffffffffffffffff168473ffffffffffffffffffffffffffffffffffffffff1614806116b457506116b38484610855565b5b806116f257508273ffffffffffffffffffffffffffffffffffffffff166116da83610a56565b73ffffffffffffffffffffffffffffffffffffffff16145b5b90509392505050565b5f73ffffffffffffffffffffffffffffffffffffffff168273ffffffffffffffffffffffffffffffffffffffff160361176c575f6040517f64a0ae9200000000000000000000000000000000000000000000000000000000815260040161176391906119fa565b60405180910390fd5b5f61177883835f610aa8565b90505f73ffffffffffffffffffffffffffffffffffffffff168173ffffffffffffffffffffffffffffffffffffffff16146117ea575f6040517f73c6ac6e0000000000000000000000000000000000000000000000000000000081526004016117e191906119fa565b60405180910390fd5b505050565b5f604051905090565b5f80fd5b5f80fd5b5f7fffffffff0000000000000000000000000000000000000000000000000000000082169050919050565b61183481611800565b811461183e575f80fd5b50565b5f8135905061184f8161182b565b92915050565b5f6020828403121561186a576118696117f8565b5b5f61187784828501611841565b91505092915050565b5f8115159050919050565b61189481611880565b82525050565b5f6020820190506118ad5f83018461188b565b92915050565b5f81519050919050565b5f82825260208201905092915050565b5f5b838110156118ea5780820151818401526020810190506118cf565b5f8484015250505050565b5f601f19601f8301169050919050565b5f61190f826118b3565b61191981856118bd565b93506119298185602086016118cd565b611932816118f5565b840191505092915050565b5f6020820190508181035f8301526119558184611905565b905092915050565b5f819050919050565b61196f8161195d565b8114611979575f80fd5b50565b5f8135905061198a81611966565b92915050565b5f602082840312156119a5576119a46117f8565b5b5f6119b28482850161197c565b91505092915050565b5f73ffffffffffffffffffffffffffffffffffffffff82169050919050565b5f6119e4826119bb565b9050919050565b6119f4816119da565b82525050565b5f602082019050611a0d5f8301846119eb565b92915050565b611a1c816119da565b8114611a26575f80fd5b50565b5f81359050611a37
81611a13565b92915050565b5f8060408385031215611a5357611a526117f8565b5b5f611a6085828601611a29565b9250506020611a718582860161197c565b9150509250929050565b5f805f60608486031215611a9257611a916117f8565b5b5f611a9f86828701611a29565b9350506020611ab086828701611a29565b9250506040611ac18682870161197c565b9150509250925092565b5f60208284031215611ae057611adf6117f8565b5b5f611aed84828501611a29565b91505092915050565b611aff8161195d565b82525050565b5f602082019050611b185f830184611af6565b92915050565b611b2781611880565b8114611b31575f80fd5b50565b5f81359050611b4281611b1e565b92915050565b5f8060408385031215611b5e57611b5d6117f8565b5b5f611b6b85828601611a29565b9250506020611b7c85828601611b34565b9150509250929050565b5f80fd5b5f80fd5b7f4e487b71000000000000000000000000000000000000000000000000000000005f52604160045260245ffd5b611bc4826118f5565b810181811067ffffffffffffffff82111715611be357611be2611b8e565b5b80604052505050565b5f611bf56117ef565b9050611c018282611bbb565b919050565b5f67ffffffffffffffff821115611c2057611c1f611b8e565b5b611c29826118f5565b9050602081019050919050565b828183375f83830152505050565b5f611c56611c5184611c06565b611bec565b905082815260208101848484011115611c7257611c71611b8a565b5b611c7d848285611c36565b509392505050565b5f82601f830112611c9957611c98611b86565b5b8135611ca9848260208601611c44565b91505092915050565b5f805f8060808587031215611cca57611cc96117f8565b5b5f611cd787828801611a29565b9450506020611ce887828801611a29565b9350506040611cf98782880161197c565b925050606085013567ffffffffffffffff811115611d1a57611d196117fc565b5b611d2687828801611c85565b91505092959194509250565b5f8060408385031215611d4857611d476117f8565b5b5f611d5585828601611a29565b9250506020611d6685828601611a29565b9150509250929050565b7f4e487b71000000000000000000000000000000000000000000000000000000005f52602260045260245ffd5b5f6002820490506001821680611db457607f821691505b602082108103611dc757611dc6611d70565b5b50919050565b5f606082019050611de05f8301866119eb565b611ded6020830185611af6565b611dfa60408301846119eb565b949350505050565b5f81905092915050565b5f611e16826118b3565b611e20
8185611e02565b9350611e308185602086016118cd565b80840191505092915050565b5f611e478285611e0c565b9150611e538284611e0c565b91508190509392505050565b5f81519050919050565b5f82825260208201905092915050565b5f611e8382611e5f565b611e8d8185611e69565b9350611e9d8185602086016118cd565b611ea6816118f5565b840191505092915050565b5f608082019050611ec45f8301876119eb565b611ed160208301866119eb565b611ede6040830185611af6565b8181036060830152611ef08184611e79565b905095945050505050565b5f81519050611f098161182b565b92915050565b5f60208284031215611f2457611f236117f8565b5b5f611f3184828501611efb565b91505092915050565b7f4e487b71000000000000000000000000000000000000000000000000000000005f52601260045260245ffd5b5f604082019050611f7a5f8301856119eb565b611f876020830184611af6565b939250505056fea2646970667358221220432b30673e00c0eb009e1718c271f4cfdfbeded17345829703b06d322360990164736f6c63430008160033").into() +} + +fn insert_storage(trie: &mut HashedPartialTrie, slot: U256, value: U256) { + let mut bytes = [0; 32]; + slot.to_big_endian(&mut bytes); + let key = keccak(bytes); + let nibbles = Nibbles::from_bytes_be(key.as_bytes()).unwrap(); + let r = rlp::encode(&value); + let r = r.freeze().to_vec(); + trie.insert(nibbles, r); +} + +fn sd2u(s: &str) -> U256 { + U256::from_dec_str(s).unwrap() +} + +fn sh2u(s: &str) -> U256 { + U256::from_str_radix(s, 16).unwrap() +} + +fn contract_storage() -> HashedPartialTrie { + let mut trie = HashedPartialTrie::from(Node::Empty); + insert_storage( + &mut trie, + U256::zero(), + sh2u("0x54657374546f6b656e0000000000000000000000000000000000000000000012"), + ); + insert_storage( + &mut trie, + U256::one(), + sh2u("0x5445535400000000000000000000000000000000000000000000000000000008"), + ); + insert_storage( + &mut trie, + sd2u("6"), + sh2u("0x5b38da6a701c568545dcfcb03fcb875f56beddc4"), + ); + insert_storage( + &mut trie, + sh2u("0x343ff8127bd64f680be4e996254dc3528603c6ecd54364b4cf956ebdd28f0028"), + sh2u("0x5b38da6a701c568545dcfcb03fcb875f56beddc4"), + ); + insert_storage( + &mut trie, + 
sh2u("0x118c1ea466562cb796e30ef705e4db752f5c39d773d22c5efd8d46f67194e78a"), + sd2u("1"), + ); + trie +} + +fn contract_storage_after() -> HashedPartialTrie { + let mut trie = HashedPartialTrie::from(Node::Empty); + insert_storage( + &mut trie, + U256::zero(), + sh2u("0x54657374546f6b656e0000000000000000000000000000000000000000000012"), + ); + insert_storage( + &mut trie, + U256::one(), + sh2u("0x5445535400000000000000000000000000000000000000000000000000000008"), + ); + insert_storage( + &mut trie, + sd2u("6"), + sh2u("0x5b38da6a701c568545dcfcb03fcb875f56beddc4"), + ); + insert_storage( + &mut trie, + sh2u("0x343ff8127bd64f680be4e996254dc3528603c6ecd54364b4cf956ebdd28f0028"), + sh2u("0xab8483f64d9c6d1ecf9b849ae677dd3315835cb2"), + ); + insert_storage( + &mut trie, + sh2u("0xf3aa6a8a9f7e3707e36cc99c499a27514922afe861ec3d80a1a314409cba92f9"), + sd2u("1"), + ); + trie +} + +fn owner_account() -> AccountRlp { + AccountRlp { + nonce: 2.into(), + balance: 0x1000000.into(), + storage_root: HashedPartialTrie::from(Node::Empty).hash(), + code_hash: keccak([]), + } +} + +fn contract_account() -> AccountRlp { + AccountRlp { + nonce: 0.into(), + balance: 0.into(), + storage_root: contract_storage().hash(), + code_hash: keccak(contract_bytecode()), + } +} + +fn signed_tx() -> Vec { + hex!("f8c5020a8307a12094f2b1114c644cbb3ff63bf1dd284c8cd716e95be980b86442842e0e0000000000000000000000005b38da6a701c568545dcfcb03fcb875f56beddc4000000000000000000000000ab8483f64d9c6d1ecf9b849ae677dd3315835cb2000000000000000000000000000000000000000000000000000000000000053925a0414867f13ac63d663e84099d52c8215615666ea37c969c69aa58a0fad26a3f6ea01a7160c6274969083b2316eb8ca6011b4bf6b00972159a78bf64d06fa40c1402").into() +} + +fn add_logs_to_bloom(bloom: &mut [u8; 256], logs: &Vec) { + for log in logs { + add_to_bloom(bloom, log.address.as_bytes()); + for topic in &log.topics { + add_to_bloom(bloom, topic.as_bytes()); + } + } +} + +fn add_to_bloom(bloom: &mut [u8; 256], bloom_entry: &[u8]) { + let bloom_hash = 
keccak(bloom_entry).to_fixed_bytes(); + + for idx in 0..3 { + let bit_pair = u16::from_be_bytes(bloom_hash[2 * idx..2 * (idx + 1)].try_into().unwrap()); + let bit_to_set = 0x07FF - (bit_pair & 0x07FF); + let byte_index = bit_to_set / 8; + let bit_value = 1 << (7 - bit_to_set % 8); + bloom[byte_index as usize] |= bit_value; + } +} diff --git a/evm/tests/log_opcode.rs b/evm/tests/log_opcode.rs index 21bc56c48d..017f5e970f 100644 --- a/evm/tests/log_opcode.rs +++ b/evm/tests/log_opcode.rs @@ -1,5 +1,3 @@ -#![allow(clippy::upper_case_acronyms)] - use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; @@ -20,7 +18,7 @@ use plonky2_evm::fixed_recursive_verifier::AllRecursiveCircuits; use plonky2_evm::generation::mpt::transaction_testing::{AddressOption, LegacyTransactionRlp}; use plonky2_evm::generation::mpt::{AccountRlp, LegacyReceiptRlp, LogRlp}; use plonky2_evm::generation::{GenerationInputs, TrieInputs}; -use plonky2_evm::proof::{BlockHashes, BlockMetadata, ExtraBlockData, PublicValues, TrieRoots}; +use plonky2_evm::proof::{BlockHashes, BlockMetadata, TrieRoots}; use plonky2_evm::prover::prove; use plonky2_evm::verifier::verify_proof; use plonky2_evm::Node; @@ -215,42 +213,27 @@ fn test_log_opcodes() -> anyhow::Result<()> { transactions_root: transactions_trie.hash(), receipts_root: receipts_trie.hash(), }; - let block_bloom_after = [ - U256::from_dec_str("392318858461667547739736838950479151006397215279002157056").unwrap(), - 0.into(), - U256::from_dec_str( - "55213970774324510299478046898216203619608871777363092441300193790394368", - ) - .unwrap(), - U256::from_dec_str("1361129467683753853853498429727072845824").unwrap(), - U256::from_dec_str("33554432").unwrap(), - U256::from_dec_str("98079714615416886934934209737619787760822675856605315072").unwrap(), - U256::from_dec_str("262144").unwrap(), - U256::from_dec_str("6739986666787659948666753771754908317446393422488596686587943714816") - .unwrap(), - ]; + let inputs = GenerationInputs { - 
signed_txns: vec![txn.to_vec()], + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], tries: tries_before, trie_roots_after, contract_code, - genesis_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), block_metadata, txn_number_before: 0.into(), gas_used_before: 0.into(), gas_used_after: gas_used.into(), - block_bloom_before: [0.into(); 8], - block_bloom_after, block_hashes: BlockHashes { prev_hashes: vec![H256::default(); 256], cur_hash: H256::default(), }, - addresses: vec![], }; let mut timing = TimingTree::new("prove", log::Level::Debug); - let proof = prove::(&all_stark, &config, inputs, &mut timing)?; + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; timing.filter(Duration::from_millis(100)).print(); // Assert that the proof leads to the correct state and receipt roots. @@ -340,7 +323,7 @@ fn test_log_with_aggreg() -> anyhow::Result<()> { to_second_nibbles, rlp::encode(&to_account_second_before).to_vec(), ); - let genesis_state_trie_root = state_trie_before.hash(); + let checkpoint_state_trie_root = state_trie_before.hash(); let tries_before = TrieInputs { state_trie: state_trie_before, @@ -351,10 +334,10 @@ fn test_log_with_aggreg() -> anyhow::Result<()> { let txn = hex!("f85f800a82520894095e7baea6a6c7c4c2dfeb977efac326af552d870a8026a0122f370ed4023a6c253350c6bfb87d7d7eb2cd86447befee99e0a26b70baec20a07100ab1b3977f2b4571202b9f4b68850858caf5469222794600b5ce1cfb348ad"); - let block_metadata = BlockMetadata { + let block_1_metadata = BlockMetadata { block_beneficiary: Address::from(beneficiary), block_timestamp: 0x03e8.into(), - block_number: 0.into(), + block_number: 1.into(), block_difficulty: 0x020000.into(), block_gaslimit: 0x445566u32.into(), block_chain_id: 1.into(), @@ -437,42 +420,43 @@ fn test_log_with_aggreg() -> anyhow::Result<()> { receipts_root: receipts_trie.clone().hash(), }; + let block_1_hash = + 
H256::from_str("0x0101010101010101010101010101010101010101010101010101010101010101")?; + let mut block_hashes = vec![H256::default(); 256]; + let inputs_first = GenerationInputs { - signed_txns: vec![txn.to_vec()], + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], tries: tries_before, trie_roots_after: tries_after, contract_code, - genesis_state_trie_root, - block_metadata: block_metadata.clone(), + checkpoint_state_trie_root, + block_metadata: block_1_metadata.clone(), txn_number_before: 0.into(), gas_used_before: 0.into(), gas_used_after: 21000u64.into(), - block_bloom_before: [0.into(); 8], - block_bloom_after: [0.into(); 8], block_hashes: BlockHashes { - prev_hashes: vec![H256::default(); 256], - cur_hash: H256::default(), + prev_hashes: block_hashes.clone(), + cur_hash: block_1_hash, }, - addresses: vec![], }; // Preprocess all circuits. let all_circuits = AllRecursiveCircuits::::new( &all_stark, - &[16..17, 11..13, 17..19, 14..15, 9..11, 12..13, 19..21], + &[16..17, 12..15, 14..18, 14..15, 9..10, 12..13, 17..20], &config, ); let mut timing = TimingTree::new("prove root first", log::Level::Info); - let (root_proof_first, first_public_values) = - all_circuits.prove_root(&all_stark, &config, inputs_first, &mut timing)?; + let (root_proof_first, public_values_first) = + all_circuits.prove_root(&all_stark, &config, inputs_first, &mut timing, None)?; timing.filter(Duration::from_millis(100)).print(); all_circuits.verify_root(root_proof_first.clone())?; - // The output bloom filter, gas used and transaction number are fed to the next transaction, so the two proofs can be correctly aggregated. - let block_bloom_second = first_public_values.extra_block_data.block_bloom_after; - let gas_used_second = first_public_values.extra_block_data.gas_used_after; + // The gas used and transaction number are fed to the next transaction, so the two proofs can be correctly aggregated. 
+ let gas_used_second = public_values_first.extra_block_data.gas_used_after; // Prove second transaction. In this second transaction, the code with logs is executed. @@ -559,83 +543,118 @@ fn test_log_with_aggreg() -> anyhow::Result<()> { transactions_trie.insert(Nibbles::from_str("0x01").unwrap(), txn_2.to_vec()); + let block_1_state_root = expected_state_trie_after.hash(); + let trie_roots_after = TrieRoots { - state_root: expected_state_trie_after.hash(), + state_root: block_1_state_root, transactions_root: transactions_trie.hash(), receipts_root: receipts_trie.hash(), }; - let block_bloom_final = [ - 0.into(), - 0.into(), - U256::from_dec_str( - "55213970774324510299479508399853534522527075462195808724319849722937344", - ) - .unwrap(), - U256::from_dec_str("1361129467683753853853498429727072845824").unwrap(), - U256::from_dec_str("33554432").unwrap(), - U256::from_dec_str("9223372036854775808").unwrap(), - U256::from_dec_str( - "3618502788666131106986593281521497120414687020801267626233049500247285563392", - ) - .unwrap(), - U256::from_dec_str("2722259584404615024560450425766186844160").unwrap(), - ]; let inputs = GenerationInputs { - signed_txns: vec![txn_2.to_vec()], + signed_txn: Some(txn_2.to_vec()), + withdrawals: vec![], tries: tries_before, - trie_roots_after, + trie_roots_after: trie_roots_after.clone(), contract_code, - genesis_state_trie_root, - block_metadata, + checkpoint_state_trie_root, + block_metadata: block_1_metadata, txn_number_before: 1.into(), gas_used_before: gas_used_second, gas_used_after: receipt.cum_gas_used, - block_bloom_before: block_bloom_second, - block_bloom_after: block_bloom_final, block_hashes: BlockHashes { - prev_hashes: vec![H256::default(); 256], - cur_hash: H256::default(), + prev_hashes: block_hashes.clone(), + cur_hash: block_1_hash, }, - addresses: vec![], }; let mut timing = TimingTree::new("prove root second", log::Level::Info); - let (root_proof, public_values) = - all_circuits.prove_root(&all_stark, &config, 
inputs, &mut timing)?; + let (root_proof_second, public_values_second) = + all_circuits.prove_root(&all_stark, &config, inputs, &mut timing, None.clone())?; timing.filter(Duration::from_millis(100)).print(); - all_circuits.verify_root(root_proof.clone())?; + all_circuits.verify_root(root_proof_second.clone())?; + + let (agg_proof, updated_agg_public_values) = all_circuits.prove_aggregation( + false, + &root_proof_first, + public_values_first, + false, + &root_proof_second, + public_values_second, + )?; + all_circuits.verify_aggregation(&agg_proof)?; + let (first_block_proof, _block_public_values) = + all_circuits.prove_block(None, &agg_proof, updated_agg_public_values)?; + all_circuits.verify_block(&first_block_proof)?; + + // Prove the next, empty block. - // Update public values for the aggregation. - let agg_public_values = PublicValues { - trie_roots_before: first_public_values.trie_roots_before, - trie_roots_after: public_values.trie_roots_after, - extra_block_data: ExtraBlockData { - genesis_state_trie_root, - txn_number_before: first_public_values.extra_block_data.txn_number_before, - txn_number_after: public_values.extra_block_data.txn_number_after, - gas_used_before: first_public_values.extra_block_data.gas_used_before, - gas_used_after: public_values.extra_block_data.gas_used_after, - block_bloom_before: first_public_values.extra_block_data.block_bloom_before, - block_bloom_after: public_values.extra_block_data.block_bloom_after, + let block_2_hash = + H256::from_str("0x0123456789101112131415161718192021222324252627282930313233343536")?; + block_hashes[255] = block_1_hash; + + let block_2_metadata = BlockMetadata { + block_beneficiary: Address::from(beneficiary), + block_timestamp: 0x03e8.into(), + block_number: 2.into(), + block_difficulty: 0x020000.into(), + block_gaslimit: 0x445566u32.into(), + block_chain_id: 1.into(), + block_base_fee: 0xa.into(), + ..Default::default() + }; + + let mut contract_code = HashMap::new(); + 
contract_code.insert(keccak(vec![]), vec![]); + + let inputs = GenerationInputs { + signed_txn: None, + withdrawals: vec![], + tries: TrieInputs { + state_trie: expected_state_trie_after, + transactions_trie: Node::Empty.into(), + receipts_trie: Node::Empty.into(), + storage_tries: vec![], + }, + trie_roots_after: TrieRoots { + state_root: trie_roots_after.state_root, + transactions_root: HashedPartialTrie::from(Node::Empty).hash(), + receipts_root: HashedPartialTrie::from(Node::Empty).hash(), + }, + contract_code, + checkpoint_state_trie_root: block_1_state_root, // We use block 1 as new checkpoint. + block_metadata: block_2_metadata, + txn_number_before: 0.into(), + gas_used_before: 0.into(), + gas_used_after: 0.into(), + block_hashes: BlockHashes { + prev_hashes: block_hashes, + cur_hash: block_2_hash, }, - block_metadata: public_values.block_metadata, - block_hashes: public_values.block_hashes, }; - // We can duplicate the proofs here because the state hasn't mutated. + let (root_proof, public_values) = + all_circuits.prove_root(&all_stark, &config, inputs, &mut timing, None)?; + all_circuits.verify_root(root_proof.clone())?; + + // We can just duplicate the initial proof as the state didn't change. let (agg_proof, updated_agg_public_values) = all_circuits.prove_aggregation( false, - &root_proof_first, + &root_proof, + public_values.clone(), false, &root_proof, - agg_public_values, + public_values, )?; all_circuits.verify_aggregation(&agg_proof)?; - let (block_proof, _block_public_values) = - all_circuits.prove_block(None, &agg_proof, updated_agg_public_values)?; - all_circuits.verify_block(&block_proof) + + let (second_block_proof, _block_public_values) = all_circuits.prove_block( + None, // We don't specify a previous proof, considering block 1 as the new checkpoint. 
+ &agg_proof, + updated_agg_public_values, + )?; + all_circuits.verify_block(&second_block_proof) } /// Values taken from the block 1000000 of Goerli: https://goerli.etherscan.io/txs?block=1000000 @@ -752,184 +771,6 @@ fn test_txn_and_receipt_trie_hash() -> anyhow::Result<()> { Ok(()) } -#[test] -#[ignore] // Too slow to run on CI. -fn test_two_txn() -> anyhow::Result<()> { - init_logger(); - - let all_stark = AllStark::::default(); - let config = StarkConfig::standard_fast_config(); - - let beneficiary = hex!("2adc25665018aa1fe0e6bc666dac8fc2697ff9ba"); - let sender = hex!("af1276cbb260bb13deddb4209ae99ae6e497f446"); - // Private key: DCDFF53B4F013DBCDC717F89FE3BF4D8B10512AAE282B48E01D7530470382701 - let to = hex!("095e7baea6a6c7c4c2dfeb977efac326af552d87"); - - let beneficiary_state_key = keccak(beneficiary); - let sender_state_key = keccak(sender); - let to_hashed = keccak(to); - - let beneficiary_nibbles = Nibbles::from_bytes_be(beneficiary_state_key.as_bytes()).unwrap(); - let sender_nibbles = Nibbles::from_bytes_be(sender_state_key.as_bytes()).unwrap(); - let to_nibbles = Nibbles::from_bytes_be(to_hashed.as_bytes()).unwrap(); - - // Set accounts before the transaction. - let beneficiary_account_before = AccountRlp { - nonce: 1.into(), - ..AccountRlp::default() - }; - - let sender_balance_before = 50000000000000000u64; - let sender_account_before = AccountRlp { - balance: sender_balance_before.into(), - ..AccountRlp::default() - }; - let to_account_before = AccountRlp { - ..AccountRlp::default() - }; - - // Initialize the state trie with three accounts. 
- let mut state_trie_before = HashedPartialTrie::from(Node::Empty); - state_trie_before.insert( - beneficiary_nibbles, - rlp::encode(&beneficiary_account_before).to_vec(), - ); - state_trie_before.insert(sender_nibbles, rlp::encode(&sender_account_before).to_vec()); - state_trie_before.insert(to_nibbles, rlp::encode(&to_account_before).to_vec()); - - let tries_before = TrieInputs { - state_trie: state_trie_before, - transactions_trie: Node::Empty.into(), - receipts_trie: Node::Empty.into(), - storage_tries: vec![(to_hashed, Node::Empty.into())], - }; - - // Prove two simple transfers. - let gas_price = 10; - let txn_value = 0x11c37937e08000u64; - let txn_0 = hex!("f866800a82520894095e7baea6a6c7c4c2dfeb977efac326af552d878711c37937e080008026a01fcd0ce88ac7600698a771f206df24b70e67981b6f107bd7c1c24ea94f113bcba00d87cc5c7afc2988e4ff200b5a0c7016b0d5498bbc692065ca983fcbbfe02555"); - let txn_1 = hex!("f866010a82520894095e7baea6a6c7c4c2dfeb977efac326af552d878711c37937e080008026a0d8123f5f537bd3a67283f67eb136f7accdfc4ef012cfbfd3fb1d0ac7fd01b96fa004666d9feef90a1eb568570374dd19977d4da231b289d769e6f95105c06fd672"); - - let block_metadata = BlockMetadata { - block_beneficiary: Address::from(beneficiary), - block_timestamp: 0x03e8.into(), - block_number: 1.into(), - block_difficulty: 0x020000.into(), - block_random: H256::from_uint(&0x020000.into()), - block_gaslimit: 0xffffffffu32.into(), - block_chain_id: 1.into(), - block_base_fee: 0xa.into(), - block_gas_used: 0.into(), - block_blob_base_fee: 0x2.into(), - block_bloom: [0.into(); 8], - }; - - let mut contract_code = HashMap::new(); - contract_code.insert(keccak(vec![]), vec![]); - - // Update accounts - let beneficiary_account_after = AccountRlp { - nonce: 1.into(), - ..AccountRlp::default() - }; - - let sender_balance_after = sender_balance_before - gas_price * 21000 * 2 - txn_value * 2; - let sender_account_after = AccountRlp { - balance: sender_balance_after.into(), - nonce: 2.into(), - ..AccountRlp::default() - }; - let 
to_account_after = AccountRlp { - balance: (2 * txn_value).into(), - ..AccountRlp::default() - }; - - // Update the state trie. - let mut expected_state_trie_after = HashedPartialTrie::from(Node::Empty); - expected_state_trie_after.insert( - beneficiary_nibbles, - rlp::encode(&beneficiary_account_after).to_vec(), - ); - expected_state_trie_after.insert(sender_nibbles, rlp::encode(&sender_account_after).to_vec()); - expected_state_trie_after.insert(to_nibbles, rlp::encode(&to_account_after).to_vec()); - - // Compute new receipt trie. - let mut receipts_trie = HashedPartialTrie::from(Node::Empty); - - let receipt_0 = LegacyReceiptRlp { - status: true, - cum_gas_used: 21000u64.into(), - bloom: [0x00; 256].to_vec().into(), - logs: vec![], - }; - - let receipt_1 = LegacyReceiptRlp { - status: true, - cum_gas_used: 42000u64.into(), - bloom: [0x00; 256].to_vec().into(), - logs: vec![], - }; - - receipts_trie.insert( - Nibbles::from_str("0x80").unwrap(), - rlp::encode(&receipt_0).to_vec(), - ); - - receipts_trie.insert( - Nibbles::from_str("0x01").unwrap(), - rlp::encode(&receipt_1).to_vec(), - ); - - let mut transactions_trie: HashedPartialTrie = Node::Leaf { - nibbles: Nibbles::from_str("0x80").unwrap(), - value: txn_0.to_vec(), - } - .into(); - - transactions_trie.insert(Nibbles::from_str("0x01").unwrap(), txn_1.to_vec()); - - let trie_roots_after = TrieRoots { - state_root: expected_state_trie_after.hash(), - transactions_root: transactions_trie.hash(), - receipts_root: receipts_trie.hash(), - }; - let inputs = GenerationInputs { - signed_txns: vec![txn_0.to_vec(), txn_1.to_vec()], - tries: tries_before, - trie_roots_after, - contract_code, - genesis_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), - block_metadata, - txn_number_before: 0.into(), - gas_used_before: 0.into(), - gas_used_after: 42000u64.into(), - block_bloom_before: [0.into(); 8], - block_bloom_after: [0.into(); 8], - block_hashes: BlockHashes { - prev_hashes: vec![H256::default(); 256], - 
cur_hash: H256::default(), - }, - addresses: vec![], - }; - - let mut timing = TimingTree::new("prove", log::Level::Debug); - let proof = prove::(&all_stark, &config, inputs, &mut timing)?; - timing.filter(Duration::from_millis(100)).print(); - - // Assert trie roots. - assert_eq!( - proof.public_values.trie_roots_after.state_root, - expected_state_trie_after.hash() - ); - - assert_eq!( - proof.public_values.trie_roots_after.receipts_root, - receipts_trie.hash() - ); - - verify_proof(&all_stark, proof, &config) -} - fn init_logger() { let _ = try_init_from_env(Env::default().filter_or(DEFAULT_FILTER_ENV, "info")); } diff --git a/evm/tests/many_transactions.rs b/evm/tests/many_transactions.rs deleted file mode 100644 index 9678d652d3..0000000000 --- a/evm/tests/many_transactions.rs +++ /dev/null @@ -1,246 +0,0 @@ -#![allow(clippy::upper_case_acronyms)] - -use std::collections::HashMap; -use std::str::FromStr; -use std::time::Duration; - -use env_logger::{try_init_from_env, Env, DEFAULT_FILTER_ENV}; -use eth_trie_utils::nibbles::Nibbles; -use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; -use ethereum_types::{Address, H256, U256}; -use hex_literal::hex; -use keccak_hash::keccak; -use plonky2::field::goldilocks_field::GoldilocksField; -use plonky2::plonk::config::KeccakGoldilocksConfig; -use plonky2::util::timing::TimingTree; -use plonky2_evm::all_stark::AllStark; -use plonky2_evm::config::StarkConfig; -use plonky2_evm::cpu::kernel::opcodes::{get_opcode, get_push_opcode}; -use plonky2_evm::generation::mpt::{AccountRlp, LegacyReceiptRlp}; -use plonky2_evm::generation::{GenerationInputs, TrieInputs}; -use plonky2_evm::proof::{BlockHashes, BlockMetadata, TrieRoots}; -use plonky2_evm::prover::prove; -use plonky2_evm::verifier::verify_proof; -use plonky2_evm::Node; - -type F = GoldilocksField; -const D: usize = 2; -type C = KeccakGoldilocksConfig; - -/// Test the validity of four transactions, where only the first one is valid and the other three abort. 
-#[test] -fn test_four_transactions() -> anyhow::Result<()> { - init_logger(); - - let all_stark = AllStark::::default(); - let config = StarkConfig::standard_fast_config(); - - let beneficiary = hex!("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); - let sender = hex!("2c7536e3605d9c16a7a3d7b1898e529396a65c23"); - let to = hex!("a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0"); - - let beneficiary_state_key = keccak(beneficiary); - let sender_state_key = keccak(sender); - let to_state_key = keccak(to); - - let beneficiary_nibbles = Nibbles::from_bytes_be(beneficiary_state_key.as_bytes()).unwrap(); - let sender_nibbles = Nibbles::from_bytes_be(sender_state_key.as_bytes()).unwrap(); - let to_nibbles = Nibbles::from_bytes_be(to_state_key.as_bytes()).unwrap(); - - let push1 = get_push_opcode(1); - let add = get_opcode("ADD"); - let stop = get_opcode("STOP"); - let code = [push1, 3, push1, 4, add, stop]; - let code_gas = 3 + 3 + 3; - let code_hash = keccak(code); - - let beneficiary_account_before = AccountRlp::default(); - let sender_account_before = AccountRlp { - nonce: 5.into(), - - balance: eth_to_wei(100_000.into()), - - ..AccountRlp::default() - }; - let to_account_before = AccountRlp { - code_hash, - ..AccountRlp::default() - }; - - let state_trie_before = { - let mut children = core::array::from_fn(|_| Node::Empty.into()); - children[sender_nibbles.get_nibble(0) as usize] = Node::Leaf { - nibbles: sender_nibbles.truncate_n_nibbles_front(1), - - value: rlp::encode(&sender_account_before).to_vec(), - } - .into(); - children[to_nibbles.get_nibble(0) as usize] = Node::Leaf { - nibbles: to_nibbles.truncate_n_nibbles_front(1), - - value: rlp::encode(&to_account_before).to_vec(), - } - .into(); - Node::Branch { - children, - value: vec![], - } - } - .into(); - - let tries_before = TrieInputs { - state_trie: state_trie_before, - transactions_trie: Node::Empty.into(), - receipts_trie: Node::Empty.into(), - storage_tries: vec![], - }; - - // Generated using a little py-evm 
script. - let txn1 = hex!("f861050a8255f094a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0648242421ba02c89eb757d9deeb1f5b3859a9d4d679951ef610ac47ad4608dc142beb1b7e313a05af7e9fbab825455d36c36c7f4cfcafbeafa9a77bdff936b52afb36d4fe4bcdd"); - let txn2 = hex!("f863800a83061a8094095e7baea6a6c7c4c2dfeb977efac326af552d87830186a0801ba0ffb600e63115a7362e7811894a91d8ba4330e526f22121c994c4692035dfdfd5a06198379fcac8de3dbfac48b165df4bf88e2088f294b61efb9a65fe2281c76e16"); - let txn3 = hex!("f861800a8405f5e10094100000000000000000000000000000000000000080801ba07e09e26678ed4fac08a249ebe8ed680bf9051a5e14ad223e4b2b9d26e0208f37a05f6e3f188e3e6eab7d7d3b6568f5eac7d687b08d307d3154ccd8c87b4630509b"); - let txn4 = hex!("f866800a82520894095e7baea6a6c7c4c2dfeb977efac326af552d878711c37937e080008026a01fcd0ce88ac7600698a771f206df24b70e67981b6f107bd7c1c24ea94f113bcba00d87cc5c7afc2988e4ff200b5a0c7016b0d5498bbc692065ca983fcbbfe02555"); - - let txdata_gas = 2 * 16; - let gas_used = 21_000 + code_gas + txdata_gas; - - let value = U256::from(100u32); - - let block_metadata = BlockMetadata { - block_beneficiary: Address::from(beneficiary), - block_timestamp: 0x03e8.into(), - block_number: 1.into(), - block_difficulty: 0x020000.into(), - block_gaslimit: 0x445566u64.into(), - block_chain_id: 1.into(), - block_gas_used: gas_used.into(), - ..BlockMetadata::default() - }; - - let mut contract_code = HashMap::new(); - contract_code.insert(keccak(vec![]), vec![]); - contract_code.insert(code_hash, code.to_vec()); - - // Update trie roots after the 4 transactions. - // State trie. 
- let expected_state_trie_after: HashedPartialTrie = { - let beneficiary_account_after = AccountRlp { - balance: beneficiary_account_before.balance + gas_used * 10, - ..beneficiary_account_before - }; - let sender_account_after = AccountRlp { - balance: sender_account_before.balance - value - gas_used * 10, - nonce: sender_account_before.nonce + 1, - ..sender_account_before - }; - let to_account_after = AccountRlp { - balance: to_account_before.balance + value, - ..to_account_before - }; - - let mut children = core::array::from_fn(|_| Node::Empty.into()); - children[beneficiary_nibbles.get_nibble(0) as usize] = Node::Leaf { - nibbles: beneficiary_nibbles.truncate_n_nibbles_front(1), - - value: rlp::encode(&beneficiary_account_after).to_vec(), - } - .into(); - children[sender_nibbles.get_nibble(0) as usize] = Node::Leaf { - nibbles: sender_nibbles.truncate_n_nibbles_front(1), - - value: rlp::encode(&sender_account_after).to_vec(), - } - .into(); - children[to_nibbles.get_nibble(0) as usize] = Node::Leaf { - nibbles: to_nibbles.truncate_n_nibbles_front(1), - - value: rlp::encode(&to_account_after).to_vec(), - } - .into(); - Node::Branch { - children, - value: vec![], - } - } - .into(); - - // Transactions trie. - let mut transactions_trie: HashedPartialTrie = Node::Leaf { - nibbles: Nibbles::from_str("0x80").unwrap(), - value: txn1.to_vec(), - } - .into(); - transactions_trie.insert(Nibbles::from_str("0x01").unwrap(), txn2.to_vec()); - transactions_trie.insert(Nibbles::from_str("0x02").unwrap(), txn3.to_vec()); - transactions_trie.insert(Nibbles::from_str("0x03").unwrap(), txn4.to_vec()); - - // Receipts trie. 
- let mut receipts_trie = HashedPartialTrie::from(Node::Empty); - let receipt_0 = LegacyReceiptRlp { - status: true, - cum_gas_used: gas_used.into(), - bloom: [0x00; 256].to_vec().into(), - logs: vec![], - }; - let receipt_1 = LegacyReceiptRlp { - status: false, - cum_gas_used: gas_used.into(), - bloom: [0x00; 256].to_vec().into(), - logs: vec![], - }; - receipts_trie.insert( - Nibbles::from_str("0x80").unwrap(), - rlp::encode(&receipt_0).to_vec(), - ); - receipts_trie.insert( - Nibbles::from_str("0x01").unwrap(), - rlp::encode(&receipt_1).to_vec(), - ); - receipts_trie.insert( - Nibbles::from_str("0x02").unwrap(), - rlp::encode(&receipt_1).to_vec(), - ); - receipts_trie.insert( - Nibbles::from_str("0x03").unwrap(), - rlp::encode(&receipt_1).to_vec(), - ); - - let trie_roots_after = TrieRoots { - state_root: expected_state_trie_after.hash(), - transactions_root: transactions_trie.hash(), - receipts_root: receipts_trie.hash(), - }; - let inputs = GenerationInputs { - signed_txns: vec![txn1.to_vec(), txn2.to_vec(), txn3.to_vec(), txn4.to_vec()], - tries: tries_before, - trie_roots_after, - genesis_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), - contract_code, - block_metadata, - addresses: vec![], - block_bloom_before: [0.into(); 8], - gas_used_before: 0.into(), - gas_used_after: gas_used.into(), - txn_number_before: 0.into(), - block_bloom_after: [0.into(); 8], - block_hashes: BlockHashes { - prev_hashes: vec![H256::default(); 256], - cur_hash: H256::default(), - }, - }; - - let mut timing = TimingTree::new("prove", log::Level::Debug); - let proof = prove::(&all_stark, &config, inputs, &mut timing)?; - timing.filter(Duration::from_millis(100)).print(); - - verify_proof(&all_stark, proof, &config) -} - -fn eth_to_wei(eth: U256) -> U256 { - // 1 ether = 10^18 wei. 
- eth * U256::from(10).pow(18.into()) -} - -fn init_logger() { - let _ = try_init_from_env(Env::default().filter_or(DEFAULT_FILTER_ENV, "info")); -} diff --git a/evm/tests/self_balance_gas_cost.rs b/evm/tests/self_balance_gas_cost.rs index 6da44ef452..c383d89c9b 100644 --- a/evm/tests/self_balance_gas_cost.rs +++ b/evm/tests/self_balance_gas_cost.rs @@ -171,26 +171,24 @@ fn self_balance_gas_cost() -> anyhow::Result<()> { receipts_root: receipts_trie.hash(), }; let inputs = GenerationInputs { - signed_txns: vec![txn.to_vec()], + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], tries: tries_before, trie_roots_after, contract_code, - genesis_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), block_metadata, txn_number_before: 0.into(), gas_used_before: 0.into(), gas_used_after: gas_used.into(), - block_bloom_before: [0.into(); 8], - block_bloom_after: [0.into(); 8], block_hashes: BlockHashes { prev_hashes: vec![H256::default(); 256], cur_hash: H256::default(), }, - addresses: vec![], }; let mut timing = TimingTree::new("prove", log::Level::Debug); - let proof = prove::(&all_stark, &config, inputs, &mut timing)?; + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; timing.filter(Duration::from_millis(100)).print(); verify_proof(&all_stark, proof, &config) diff --git a/evm/tests/selfdestruct.rs b/evm/tests/selfdestruct.rs new file mode 100644 index 0000000000..d075c731d3 --- /dev/null +++ b/evm/tests/selfdestruct.rs @@ -0,0 +1,165 @@ +use std::str::FromStr; +use std::time::Duration; + +use env_logger::{try_init_from_env, Env, DEFAULT_FILTER_ENV}; +use eth_trie_utils::nibbles::Nibbles; +use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; +use ethereum_types::{Address, BigEndianHash, H256, U256}; +use hex_literal::hex; +use keccak_hash::keccak; +use plonky2::field::goldilocks_field::GoldilocksField; +use 
plonky2::plonk::config::KeccakGoldilocksConfig; +use plonky2::util::timing::TimingTree; +use plonky2_evm::all_stark::AllStark; +use plonky2_evm::config::StarkConfig; +use plonky2_evm::generation::mpt::{AccountRlp, LegacyReceiptRlp}; +use plonky2_evm::generation::{GenerationInputs, TrieInputs}; +use plonky2_evm::proof::{BlockHashes, BlockMetadata, TrieRoots}; +use plonky2_evm::prover::prove; +use plonky2_evm::verifier::verify_proof; +use plonky2_evm::Node; + +type F = GoldilocksField; +const D: usize = 2; +type C = KeccakGoldilocksConfig; + +/// Test a simple selfdestruct. +#[test] +fn test_selfdestruct() -> anyhow::Result<()> { + init_logger(); + + let all_stark = AllStark::::default(); + let config = StarkConfig::standard_fast_config(); + + let beneficiary = hex!("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); + let sender = hex!("5eb96AA102a29fAB267E12A40a5bc6E9aC088759"); + let to = hex!("a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0"); + + let sender_state_key = keccak(sender); + let to_state_key = keccak(to); + + let sender_nibbles = Nibbles::from_bytes_be(sender_state_key.as_bytes()).unwrap(); + let to_nibbles = Nibbles::from_bytes_be(to_state_key.as_bytes()).unwrap(); + + let sender_account_before = AccountRlp { + nonce: 5.into(), + balance: eth_to_wei(100_000.into()), + storage_root: HashedPartialTrie::from(Node::Empty).hash(), + code_hash: keccak([]), + }; + let code = vec![ + 0x32, // ORIGIN + 0xFF, // SELFDESTRUCT + ]; + let to_account_before = AccountRlp { + nonce: 12.into(), + balance: eth_to_wei(10_000.into()), + storage_root: HashedPartialTrie::from(Node::Empty).hash(), + code_hash: keccak(&code), + }; + + let mut state_trie_before = HashedPartialTrie::from(Node::Empty); + state_trie_before.insert(sender_nibbles, rlp::encode(&sender_account_before).to_vec()); + state_trie_before.insert(to_nibbles, rlp::encode(&to_account_before).to_vec()); + + let tries_before = TrieInputs { + state_trie: state_trie_before, + transactions_trie: 
HashedPartialTrie::from(Node::Empty), + receipts_trie: HashedPartialTrie::from(Node::Empty), + storage_tries: vec![], + }; + + // Generated using a little py-evm script. + let txn = hex!("f868050a831e848094a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0880de0b6b3a76400008025a09bab8db7d72e4b42cba8b117883e16872966bae8e4570582de6ed0065e8c36a1a01256d44d982c75e0ab7a19f61ab78afa9e089d51c8686fdfbee085a5ed5d8ff8"); + + let block_metadata = BlockMetadata { + block_beneficiary: Address::from(beneficiary), + block_timestamp: 0x03e8.into(), + block_number: 1.into(), + block_difficulty: 0x020000.into(), + block_random: H256::from_uint(&0x020000.into()), + block_gaslimit: 0xff112233u32.into(), + block_chain_id: 1.into(), + block_base_fee: 0xa.into(), + block_gas_used: 26002.into(), + block_blob_base_fee: 0x2.into(), + block_bloom: [0.into(); 8], + }; + + let contract_code = [(keccak(&code), code.clone()), (keccak([]), vec![])].into(); + + let expected_state_trie_after: HashedPartialTrie = { + let mut state_trie_after = HashedPartialTrie::from(Node::Empty); + let sender_account_after = AccountRlp { + nonce: 6.into(), + balance: eth_to_wei(110_000.into()) - 26_002 * 0xa, + storage_root: HashedPartialTrie::from(Node::Empty).hash(), + code_hash: keccak([]), + }; + state_trie_after.insert(sender_nibbles, rlp::encode(&sender_account_after).to_vec()); + + // EIP-6780: The account won't be deleted because it wasn't created during this transaction. 
+ let to_account_before = AccountRlp { + nonce: 12.into(), + balance: 0.into(), + storage_root: HashedPartialTrie::from(Node::Empty).hash(), + code_hash: keccak(&code), + }; + state_trie_after.insert(to_nibbles, rlp::encode(&to_account_before).to_vec()); + state_trie_after + }; + + let receipt_0 = LegacyReceiptRlp { + status: true, + cum_gas_used: 26002.into(), + bloom: vec![0; 256].into(), + logs: vec![], + }; + let mut receipts_trie = HashedPartialTrie::from(Node::Empty); + receipts_trie.insert( + Nibbles::from_str("0x80").unwrap(), + rlp::encode(&receipt_0).to_vec(), + ); + let transactions_trie: HashedPartialTrie = Node::Leaf { + nibbles: Nibbles::from_str("0x80").unwrap(), + value: txn.to_vec(), + } + .into(); + + let trie_roots_after = TrieRoots { + state_root: expected_state_trie_after.hash(), + transactions_root: transactions_trie.hash(), + receipts_root: receipts_trie.hash(), + }; + let inputs = GenerationInputs { + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], + tries: tries_before, + trie_roots_after, + contract_code, + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + block_metadata, + txn_number_before: 0.into(), + gas_used_before: 0.into(), + gas_used_after: 26002.into(), + block_hashes: BlockHashes { + prev_hashes: vec![H256::default(); 256], + cur_hash: H256::default(), + }, + }; + + let mut timing = TimingTree::new("prove", log::Level::Debug); + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; + timing.filter(Duration::from_millis(100)).print(); + + verify_proof(&all_stark, proof, &config) +} + +fn eth_to_wei(eth: U256) -> U256 { + // 1 ether = 10^18 wei. 
+ eth * U256::from(10).pow(18.into()) +} + +fn init_logger() { + let _ = try_init_from_env(Env::default().filter_or(DEFAULT_FILTER_ENV, "info")); +} diff --git a/evm/tests/simple_transfer.rs b/evm/tests/simple_transfer.rs index ccf91d4a26..707a2e2ce0 100644 --- a/evm/tests/simple_transfer.rs +++ b/evm/tests/simple_transfer.rs @@ -139,26 +139,24 @@ fn test_simple_transfer() -> anyhow::Result<()> { receipts_root: receipts_trie.hash(), }; let inputs = GenerationInputs { - signed_txns: vec![txn.to_vec()], + signed_txn: Some(txn.to_vec()), + withdrawals: vec![], tries: tries_before, trie_roots_after, contract_code, - genesis_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), block_metadata, txn_number_before: 0.into(), gas_used_before: 0.into(), gas_used_after: 21032.into(), - block_bloom_before: [0.into(); 8], - block_bloom_after: [0.into(); 8], block_hashes: BlockHashes { prev_hashes: vec![H256::default(); 256], cur_hash: H256::default(), }, - addresses: vec![], }; let mut timing = TimingTree::new("prove", log::Level::Debug); - let proof = prove::(&all_stark, &config, inputs, &mut timing)?; + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; timing.filter(Duration::from_millis(100)).print(); verify_proof(&all_stark, proof, &config) diff --git a/evm/tests/withdrawals.rs b/evm/tests/withdrawals.rs new file mode 100644 index 0000000000..ef2d19b02a --- /dev/null +++ b/evm/tests/withdrawals.rs @@ -0,0 +1,96 @@ +use std::collections::HashMap; +use std::time::Duration; + +use env_logger::{try_init_from_env, Env, DEFAULT_FILTER_ENV}; +use eth_trie_utils::nibbles::Nibbles; +use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie}; +use ethereum_types::{H160, H256, U256}; +use keccak_hash::keccak; +use plonky2::field::goldilocks_field::GoldilocksField; +use plonky2::plonk::config::PoseidonGoldilocksConfig; +use plonky2::util::timing::TimingTree; +use 
plonky2_evm::all_stark::AllStark; +use plonky2_evm::config::StarkConfig; +use plonky2_evm::generation::mpt::AccountRlp; +use plonky2_evm::generation::{GenerationInputs, TrieInputs}; +use plonky2_evm::proof::{BlockHashes, BlockMetadata, TrieRoots}; +use plonky2_evm::prover::prove; +use plonky2_evm::verifier::verify_proof; +use plonky2_evm::Node; +use rand::random; + +type F = GoldilocksField; +const D: usize = 2; +type C = PoseidonGoldilocksConfig; + +/// Execute 0 txns and 1 withdrawal. +#[test] +fn test_withdrawals() -> anyhow::Result<()> { + init_logger(); + + let all_stark = AllStark::::default(); + let config = StarkConfig::standard_fast_config(); + + let block_metadata = BlockMetadata::default(); + + let state_trie_before = HashedPartialTrie::from(Node::Empty); + let transactions_trie = HashedPartialTrie::from(Node::Empty); + let receipts_trie = HashedPartialTrie::from(Node::Empty); + let storage_tries = vec![]; + + let mut contract_code = HashMap::new(); + contract_code.insert(keccak(vec![]), vec![]); + + // Just one withdrawal. 
+ let withdrawals = vec![(H160(random()), U256(random()))]; + + let state_trie_after = { + let mut trie = HashedPartialTrie::from(Node::Empty); + let addr_state_key = keccak(withdrawals[0].0); + let addr_nibbles = Nibbles::from_bytes_be(addr_state_key.as_bytes()).unwrap(); + let account = AccountRlp { + balance: withdrawals[0].1, + ..AccountRlp::default() + }; + trie.insert(addr_nibbles, rlp::encode(&account).to_vec()); + trie + }; + + let trie_roots_after = TrieRoots { + state_root: state_trie_after.hash(), + transactions_root: transactions_trie.hash(), + receipts_root: receipts_trie.hash(), + }; + + let inputs = GenerationInputs { + signed_txn: None, + withdrawals, + tries: TrieInputs { + state_trie: state_trie_before, + transactions_trie, + receipts_trie, + storage_tries, + }, + trie_roots_after, + contract_code, + checkpoint_state_trie_root: HashedPartialTrie::from(Node::Empty).hash(), + block_metadata, + txn_number_before: 0.into(), + gas_used_before: 0.into(), + gas_used_after: 0.into(), + block_hashes: BlockHashes { + prev_hashes: vec![H256::default(); 256], + cur_hash: H256::default(), + }, + }; + + let mut timing = TimingTree::new("prove", log::Level::Debug); + let proof = prove::(&all_stark, &config, inputs, &mut timing, None)?; + timing.filter(Duration::from_millis(100)).print(); + + verify_proof(&all_stark, proof, &config) +} + +fn init_logger() { + let _ = try_init_from_env(Env::default().filter_or(DEFAULT_FILTER_ENV, "info")); +} diff --git a/field/.cargo/katex-header.html b/field/.cargo/katex-header.html new file mode 100644 index 0000000000..20723b5d27 --- /dev/null +++ b/field/.cargo/katex-header.html @@ -0,0 +1 @@ +../../.cargo/katex-header.html \ No newline at end of file diff --git a/field/Cargo.toml b/field/Cargo.toml index ed5ef27bc2..72408c4946 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -15,3 +15,7 @@ rand = { version = "0.8.5", default-features = false, features = ["getrandom"] } serde = { version = "1.0", default-features = 
false, features = ["alloc", "derive"] } static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } + +# Display math equations properly in documentation +[package.metadata.docs.rs] +rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] diff --git a/field/src/arch/x86_64/avx2_goldilocks_field.rs b/field/src/arch/x86_64/avx2_goldilocks_field.rs index ffae8693be..c7e0ec9e02 100644 --- a/field/src/arch/x86_64/avx2_goldilocks_field.rs +++ b/field/src/arch/x86_64/avx2_goldilocks_field.rs @@ -82,12 +82,14 @@ impl Default for Avx2GoldilocksField { impl Div for Avx2GoldilocksField { type Output = Self; + #[allow(clippy::suspicious_arithmetic_impl)] #[inline] fn div(self, rhs: GoldilocksField) -> Self { self * rhs.inverse() } } impl DivAssign for Avx2GoldilocksField { + #[allow(clippy::suspicious_op_assign_impl)] #[inline] fn div_assign(&mut self, rhs: GoldilocksField) { *self *= rhs.inverse(); @@ -318,8 +320,7 @@ unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i { let res_wrapped_s = _mm256_add_epi64(x, y_s); let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed else 0. let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. - let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt); - res_s + _mm256_add_epi64(res_wrapped_s, wrapback_amt) } #[inline] @@ -337,8 +338,7 @@ unsafe fn sub(x: __m256i, y: __m256i) -> __m256i { let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0. let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflow else 0. let res_wrapped = _mm256_sub_epi64(x_s, y_s); - let res = _mm256_sub_epi64(res_wrapped, wrapback_amt); - res + _mm256_sub_epi64(res_wrapped, wrapback_amt) } #[inline] @@ -425,10 +425,9 @@ unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { // 0xffffffff and the addition of the low 32 bits generated a carry. 
This can never occur if y // <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no carry can occur. let mask = _mm256_cmpgt_epi32(x_s, res_wrapped_s); // -1 if overflowed else 0. - // The mask contains 0xffffffff in the high 32 bits if wraparound occured and 0 otherwise. + // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise. let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. - let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt); - res_s + _mm256_add_epi64(res_wrapped_s, wrapback_amt) } /// Goldilocks subtraction of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be @@ -442,10 +441,9 @@ unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { // 0xffffffff and the subtraction of the low 32 bits generated a borrow. This can never occur if // y <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no borrow can occur. let mask = _mm256_cmpgt_epi32(res_wrapped_s, x_s); // -1 if underflowed else 0. - // The mask contains 0xffffffff in the high 32 bits if wraparound occured and 0 otherwise. + // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise. let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflowed else 0. - let res_s = _mm256_sub_epi64(res_wrapped_s, wrapback_amt); - res_s + _mm256_sub_epi64(res_wrapped_s, wrapback_amt) } #[inline] @@ -456,8 +454,7 @@ unsafe fn reduce128(x: (__m256i, __m256i)) -> __m256i { let lo1_s = sub_small_64s_64_s(lo0_s, hi_hi0); let t1 = _mm256_mul_epu32(hi0, EPSILON); let lo2_s = add_small_64s_64_s(lo1_s, t1); - let lo2 = shift(lo2_s); - lo2 + shift(lo2_s) } /// Multiply two integers modulo FIELD_ORDER. 
@@ -628,6 +625,7 @@ mod tests { } } + #[allow(clippy::zero_prefixed_literal)] #[test] fn test_interleave() { let in_a: [GoldilocksField; 4] = [ diff --git a/field/src/batch_util.rs b/field/src/batch_util.rs index 4338b7e422..ab7ee3d507 100644 --- a/field/src/batch_util.rs +++ b/field/src/batch_util.rs @@ -2,7 +2,7 @@ use crate::packable::Packable; use crate::packed::PackedField; use crate::types::Field; -fn pack_with_leftovers_split_point(slice: &[P::Scalar]) -> usize { +const fn pack_with_leftovers_split_point(slice: &[P::Scalar]) -> usize { let n = slice.len(); let n_leftover = n % P::WIDTH; n - n_leftover diff --git a/field/src/extension/algebra.rs b/field/src/extension/algebra.rs index 8ca939b228..f7ca3caeb9 100644 --- a/field/src/extension/algebra.rs +++ b/field/src/extension/algebra.rs @@ -17,11 +17,11 @@ impl, const D: usize> ExtensionAlgebra { F::ONE.into() } - pub fn from_basefield_array(arr: [F; D]) -> Self { + pub const fn from_basefield_array(arr: [F; D]) -> Self { Self(arr) } - pub fn to_basefield_array(self) -> [F; D] { + pub const fn to_basefield_array(self) -> [F; D] { self.0 } diff --git a/field/src/extension/mod.rs b/field/src/extension/mod.rs index bbbaca25e5..3586055e3f 100644 --- a/field/src/extension/mod.rs +++ b/field/src/extension/mod.rs @@ -15,7 +15,7 @@ pub trait OEF: FieldExtension { // Element W of BaseField, such that `X^d - W` is irreducible over BaseField. const W: Self::BaseField; - // Element of BaseField such that DTH_ROOT^D == 1. Implementors + // Element of BaseField such that DTH_ROOT^D == 1. Implementers // should set this to W^((p - 1)/D), where W is as above and p is // the order of the BaseField. 
const DTH_ROOT: Self::BaseField; diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs index 8b53f8b5f7..6dd15ce0d7 100644 --- a/field/src/goldilocks_extensions.rs +++ b/field/src/goldilocks_extensions.rs @@ -114,14 +114,14 @@ impl Mul for QuinticExtension { /// Return `a`, `b` such that `a + b*2^128 = 3*(x + y*2^128)` with `a < 2^128` and `b < 2^32`. #[inline(always)] -fn u160_times_3(x: u128, y: u32) -> (u128, u32) { +const fn u160_times_3(x: u128, y: u32) -> (u128, u32) { let (s, cy) = x.overflowing_add(x << 1); (s, 3 * y + (x >> 127) as u32 + cy as u32) } /// Return `a`, `b` such that `a + b*2^128 = 7*(x + y*2^128)` with `a < 2^128` and `b < 2^32`. #[inline(always)] -fn u160_times_7(x: u128, y: u32) -> (u128, u32) { +const fn u160_times_7(x: u128, y: u32) -> (u128, u32) { let (d, br) = (x << 3).overflowing_sub(x); // NB: subtracting the borrow can't underflow (d, 7 * y + (x >> (128 - 3)) as u32 - br as u32) diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index 6e4361fccc..4e459c9082 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -3,7 +3,7 @@ use core::hash::{Hash, Hasher}; use core::iter::{Product, Sum}; use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; -use num::{BigUint, Integer}; +use num::{BigUint, Integer, ToPrimitive}; use plonky2_util::{assume, branch_hint}; use serde::{Deserialize, Serialize}; @@ -104,7 +104,7 @@ impl Field for GoldilocksField { /// Therefore $a^(p-2) = a^-1 (mod p)$ /// /// The following code has been adapted from winterfell/math/src/field/f64/mod.rs - /// located at https://github.com/facebook/winterfell. + /// located at . 
fn try_inverse(&self) -> Option { if self.is_zero() { return None; @@ -147,7 +147,7 @@ impl Field for GoldilocksField { } fn from_noncanonical_biguint(n: BigUint) -> Self { - Self(n.mod_floor(&Self::order()).to_u64_digits()[0]) + Self(n.mod_floor(&Self::order()).to_u64().unwrap()) } #[inline(always)] @@ -381,7 +381,7 @@ unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 { #[inline(always)] #[cfg(not(target_arch = "x86_64"))] -unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 { +const unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 { let (res_wrapped, carry) = x.overflowing_add(y); // Below cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect. res_wrapped + EPSILON * (carry as u64) @@ -415,7 +415,7 @@ fn reduce128(x: u128) -> GoldilocksField { } #[inline] -fn split(x: u128) -> (u64, u64) { +const fn split(x: u128) -> (u64, u64) { (x as u64, (x >> 64) as u64) } diff --git a/maybe_rayon/.cargo/katex-header.html b/maybe_rayon/.cargo/katex-header.html new file mode 100644 index 0000000000..20723b5d27 --- /dev/null +++ b/maybe_rayon/.cargo/katex-header.html @@ -0,0 +1 @@ +../../.cargo/katex-header.html \ No newline at end of file diff --git a/maybe_rayon/Cargo.toml b/maybe_rayon/Cargo.toml index 89499e7423..e436563215 100644 --- a/maybe_rayon/Cargo.toml +++ b/maybe_rayon/Cargo.toml @@ -10,3 +10,7 @@ parallel = ["rayon"] [dependencies] rayon = { version = "1.5.3", optional = true } + +# Display math equations properly in documentation +[package.metadata.docs.rs] +rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] diff --git a/plonky2/.cargo/katex-header.html b/plonky2/.cargo/katex-header.html new file mode 100644 index 0000000000..20723b5d27 --- /dev/null +++ b/plonky2/.cargo/katex-header.html @@ -0,0 +1 @@ +../../.cargo/katex-header.html \ No newline at end of file diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index ad586679de..4cc44cccd3 100644 --- a/plonky2/Cargo.toml +++ 
b/plonky2/Cargo.toml @@ -15,7 +15,7 @@ default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"] gate_testing = [] parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"] std = ["anyhow/std", "rand/std", "itertools/use_std"] -timing = ["std"] +timing = ["std", "dep:web-time"] [dependencies] ahash = { version = "0.8.3", default-features = false, features = ["compile-time-rng"] } # NOTE: Be sure to keep this version the same as the dependency in `hashbrown`. @@ -34,6 +34,7 @@ serde = { version = "1.0", default-features = false, features = ["derive", "rc"] serde_json = "1.0" static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } +web-time = { version = "1.0.0", optional = true } [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dependencies] getrandom = { version = "0.2", default-features = false, features = ["js"] } @@ -78,3 +79,7 @@ harness = false [[bench]] name = "reverse_index_bits" harness = false + +# Display math equations properly in documentation +[package.metadata.docs.rs] +rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] diff --git a/plonky2/src/fri/mod.rs b/plonky2/src/fri/mod.rs index 5121d755c8..207a2ea82c 100644 --- a/plonky2/src/fri/mod.rs +++ b/plonky2/src/fri/mod.rs @@ -1,3 +1,8 @@ +//! Fast Reed-Solomon IOP (FRI) protocol. +//! +//! It provides both a native implementation and an in-circuit version +//! of the FRI verifier for recursive proof composition. + use alloc::vec::Vec; use serde::Serialize; @@ -15,6 +20,7 @@ mod validate_shape; pub mod verifier; pub mod witness_util; +/// A configuration for the FRI protocol. #[derive(Debug, Clone, Eq, PartialEq, Serialize)] pub struct FriConfig { /// `rate = 2^{-rate_bits}`. @@ -23,8 +29,10 @@ pub struct FriConfig { /// Height of Merkle tree caps. pub cap_height: usize, + /// Number of bits used for grinding. 
pub proof_of_work_bits: u32, + /// The reduction strategy to be applied at each layer during the commit phase. pub reduction_strategy: FriReductionStrategy, /// Number of query rounds to perform. @@ -51,7 +59,7 @@ impl FriConfig { } } - pub fn num_cap_elements(&self) -> usize { + pub const fn num_cap_elements(&self) -> usize { 1 << self.cap_height } } @@ -85,11 +93,11 @@ impl FriParams { self.reduction_arity_bits.iter().copied().max() } - pub fn lde_bits(&self) -> usize { + pub const fn lde_bits(&self) -> usize { self.degree_bits + self.config.rate_bits } - pub fn lde_size(&self) -> usize { + pub const fn lde_size(&self) -> usize { 1 << self.lde_bits() } diff --git a/plonky2/src/fri/reduction_strategies.rs b/plonky2/src/fri/reduction_strategies.rs index df273eea11..6e5752296e 100644 --- a/plonky2/src/fri/reduction_strategies.rs +++ b/plonky2/src/fri/reduction_strategies.rs @@ -1,10 +1,10 @@ use alloc::vec; use alloc::vec::Vec; -#[cfg(feature = "timing")] -use std::time::Instant; use log::debug; use serde::Serialize; +#[cfg(feature = "timing")] +use web_time::Instant; /// A method for deciding what arity to use at each reduction layer. #[derive(Debug, Clone, Eq, PartialEq, Serialize)] diff --git a/plonky2/src/gadgets/arithmetic.rs b/plonky2/src/gadgets/arithmetic.rs index 858a4eaf07..9982628e02 100644 --- a/plonky2/src/gadgets/arithmetic.rs +++ b/plonky2/src/gadgets/arithmetic.rs @@ -1,3 +1,4 @@ +use alloc::string::{String, ToString}; use alloc::vec; use alloc::vec::Vec; use core::borrow::Borrow; @@ -190,7 +191,7 @@ impl, const D: usize> CircuitBuilder { self.arithmetic(F::ONE, F::ONE, x, one, y) } - /// Add `n` `Target`s. + /// Adds `n` `Target`s. pub fn add_many(&mut self, terms: impl IntoIterator) -> Target where T: Borrow, @@ -223,7 +224,7 @@ impl, const D: usize> CircuitBuilder { .fold(self.one(), |acc, t| self.mul(acc, *t.borrow())) } - /// Exponentiate `base` to the power of `2^power_log`. + /// Exponentiates `base` to the power of `2^power_log`. 
pub fn exp_power_of_2(&mut self, base: Target, power_log: usize) -> Target { if power_log > self.num_base_arithmetic_ops_per_gate() { // Cheaper to just use `ExponentiateGate`. @@ -238,7 +239,7 @@ impl, const D: usize> CircuitBuilder { } // TODO: Test - /// Exponentiate `base` to the power of `exponent`, given by its little-endian bits. + /// Exponentiates `base` to the power of `exponent`, given by its little-endian bits. pub fn exp_from_bits( &mut self, base: Target, @@ -263,7 +264,7 @@ impl, const D: usize> CircuitBuilder { } // TODO: Test - /// Exponentiate `base` to the power of `exponent`, where `exponent < 2^num_bits`. + /// Exponentiates `base` to the power of `exponent`, where `exponent < 2^num_bits`. pub fn exp(&mut self, base: Target, exponent: Target, num_bits: usize) -> Target { let exponent_bits = self.split_le(exponent, num_bits); @@ -302,7 +303,7 @@ impl, const D: usize> CircuitBuilder { product } - /// Exponentiate `base` to the power of a known `exponent`. + /// Exponentiates `base` to the power of a known `exponent`. // TODO: Test pub fn exp_u64(&mut self, base: Target, mut exponent: u64) -> Target { let mut exp_bits = Vec::new(); @@ -329,28 +330,32 @@ impl, const D: usize> CircuitBuilder { self.inverse_extension(x_ext).0[0] } + /// Computes the logical NOT of the provided [`BoolTarget`]. pub fn not(&mut self, b: BoolTarget) -> BoolTarget { let one = self.one(); let res = self.sub(one, b.target); BoolTarget::new_unsafe(res) } + /// Computes the logical AND of the provided [`BoolTarget`]s. pub fn and(&mut self, b1: BoolTarget, b2: BoolTarget) -> BoolTarget { BoolTarget::new_unsafe(self.mul(b1.target, b2.target)) } - /// computes the arithmetic extension of logical "or": `b1 + b2 - b1 * b2` + /// Computes the logical OR through the arithmetic expression: `b1 + b2 - b1 * b2`. 
pub fn or(&mut self, b1: BoolTarget, b2: BoolTarget) -> BoolTarget { let res_minus_b2 = self.arithmetic(-F::ONE, F::ONE, b1.target, b2.target, b1.target); BoolTarget::new_unsafe(self.add(res_minus_b2, b2.target)) } + /// Outputs `x` if `b` is true, and else `y`, through the formula: `b*x + (1-b)*y`. pub fn _if(&mut self, b: BoolTarget, x: Target, y: Target) -> Target { let not_b = self.not(b); let maybe_x = self.mul(b.target, x); self.mul_add(not_b.target, y, maybe_x) } + /// Checks whether `x` and `y` are equal and outputs the boolean result. pub fn is_equal(&mut self, x: Target, y: Target) -> BoolTarget { let zero = self.zero(); diff --git a/plonky2/src/gadgets/arithmetic_extension.rs b/plonky2/src/gadgets/arithmetic_extension.rs index 0fe8083aa3..3c1deac381 100644 --- a/plonky2/src/gadgets/arithmetic_extension.rs +++ b/plonky2/src/gadgets/arithmetic_extension.rs @@ -1,3 +1,4 @@ +use alloc::string::{String, ToString}; use alloc::vec; use alloc::vec::Vec; use core::borrow::Borrow; diff --git a/plonky2/src/gadgets/interpolation.rs b/plonky2/src/gadgets/interpolation.rs index daf51d2103..6adbc42779 100644 --- a/plonky2/src/gadgets/interpolation.rs +++ b/plonky2/src/gadgets/interpolation.rs @@ -38,6 +38,9 @@ impl, const D: usize> CircuitBuilder { #[cfg(test)] mod tests { + #[cfg(not(feature = "std"))] + use alloc::vec::Vec; + use anyhow::Result; use crate::field::extension::FieldExtension; diff --git a/plonky2/src/gadgets/lookup.rs b/plonky2/src/gadgets/lookup.rs index 826f3e2902..4ab765ba03 100644 --- a/plonky2/src/gadgets/lookup.rs +++ b/plonky2/src/gadgets/lookup.rs @@ -1,3 +1,6 @@ +use alloc::borrow::ToOwned; +use alloc::vec; + use crate::field::extension::Extendable; use crate::gates::lookup::LookupGate; use crate::gates::lookup_table::{LookupTable, LookupTableGate}; diff --git a/plonky2/src/gadgets/mod.rs b/plonky2/src/gadgets/mod.rs index 9016211f97..cc14a83550 100644 --- a/plonky2/src/gadgets/mod.rs +++ b/plonky2/src/gadgets/mod.rs @@ -1,3 +1,7 @@ +//! 
Helper gadgets providing additional methods to +//! [CircuitBuilder](crate::plonk::circuit_builder::CircuitBuilder), +//! to ease circuit creation. + pub mod arithmetic; pub mod arithmetic_extension; pub mod hash; diff --git a/plonky2/src/gadgets/range_check.rs b/plonky2/src/gadgets/range_check.rs index bdb35f9edc..41af064aa6 100644 --- a/plonky2/src/gadgets/range_check.rs +++ b/plonky2/src/gadgets/range_check.rs @@ -1,3 +1,4 @@ +use alloc::string::{String, ToString}; use alloc::vec; use alloc::vec::Vec; diff --git a/plonky2/src/gadgets/split_base.rs b/plonky2/src/gadgets/split_base.rs index 0a39b8f00c..a2c98ac707 100644 --- a/plonky2/src/gadgets/split_base.rs +++ b/plonky2/src/gadgets/split_base.rs @@ -1,5 +1,6 @@ -use alloc::vec; +use alloc::string::String; use alloc::vec::Vec; +use alloc::{format, vec}; use core::borrow::Borrow; use itertools::Itertools; @@ -90,7 +91,7 @@ impl, const B: usize, const D: usize> SimpleGenerat for BaseSumGenerator { fn id(&self) -> String { - "BaseSumGenerator".to_string() + format!("BaseSumGenerator + Base: {B}") } fn dependencies(&self) -> Vec { diff --git a/plonky2/src/gadgets/split_join.rs b/plonky2/src/gadgets/split_join.rs index fb83c3a6cc..6901c8caf2 100644 --- a/plonky2/src/gadgets/split_join.rs +++ b/plonky2/src/gadgets/split_join.rs @@ -1,3 +1,4 @@ +use alloc::string::{String, ToString}; use alloc::vec; use alloc::vec::Vec; diff --git a/plonky2/src/gates/arithmetic_base.rs b/plonky2/src/gates/arithmetic_base.rs index 631b1c3715..dfdd87e8c0 100644 --- a/plonky2/src/gates/arithmetic_base.rs +++ b/plonky2/src/gates/arithmetic_base.rs @@ -1,5 +1,5 @@ use alloc::format; -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use crate::field::extension::Extendable; @@ -20,8 +20,8 @@ use crate::plonk::vars::{ }; use crate::util::serialization::{Buffer, IoResult, Read, Write}; -/// A gate which can perform a weighted multiply-add, i.e. `result = c0 x y + c1 z`. 
If the config -/// supports enough routed wires, it can support several such operations in one gate. +/// A gate which can perform a weighted multiply-add, i.e. `result = c0.x.y + c1.z`. If the config +/// has enough routed wires, it can support several such operations in one gate. #[derive(Debug, Clone)] pub struct ArithmeticGate { /// Number of arithmetic operations performed by an arithmetic gate. @@ -29,28 +29,28 @@ pub struct ArithmeticGate { } impl ArithmeticGate { - pub fn new_from_config(config: &CircuitConfig) -> Self { + pub const fn new_from_config(config: &CircuitConfig) -> Self { Self { num_ops: Self::num_ops(config), } } /// Determine the maximum number of operations that can fit in one gate for the given config. - pub(crate) fn num_ops(config: &CircuitConfig) -> usize { + pub(crate) const fn num_ops(config: &CircuitConfig) -> usize { let wires_per_op = 4; config.num_routed_wires / wires_per_op } - pub fn wire_ith_multiplicand_0(i: usize) -> usize { + pub const fn wire_ith_multiplicand_0(i: usize) -> usize { 4 * i } - pub fn wire_ith_multiplicand_1(i: usize) -> usize { + pub const fn wire_ith_multiplicand_1(i: usize) -> usize { 4 * i + 1 } - pub fn wire_ith_addend(i: usize) -> usize { + pub const fn wire_ith_addend(i: usize) -> usize { 4 * i + 2 } - pub fn wire_ith_output(i: usize) -> usize { + pub const fn wire_ith_output(i: usize) -> usize { 4 * i + 3 } } diff --git a/plonky2/src/gates/arithmetic_extension.rs b/plonky2/src/gates/arithmetic_extension.rs index 294c090274..a19c6b4a4b 100644 --- a/plonky2/src/gates/arithmetic_extension.rs +++ b/plonky2/src/gates/arithmetic_extension.rs @@ -1,5 +1,5 @@ use alloc::format; -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use core::ops::Range; @@ -16,8 +16,8 @@ use crate::plonk::circuit_data::{CircuitConfig, CommonCircuitData}; use crate::plonk::vars::{EvaluationTargets, EvaluationVars, EvaluationVarsBase}; use crate::util::serialization::{Buffer, IoResult, Read, 
Write}; -/// A gate which can perform a weighted multiply-add, i.e. `result = c0 x y + c1 z`. If the config -/// supports enough routed wires, it can support several such operations in one gate. +/// A gate which can perform a weighted multiply-add, i.e. `result = c0.x.y + c1.z`. If the config +/// has enough routed wires, it can support several such operations in one gate. #[derive(Debug, Clone)] pub struct ArithmeticExtensionGate { /// Number of arithmetic operations performed by an arithmetic gate. @@ -25,28 +25,28 @@ pub struct ArithmeticExtensionGate { } impl ArithmeticExtensionGate { - pub fn new_from_config(config: &CircuitConfig) -> Self { + pub const fn new_from_config(config: &CircuitConfig) -> Self { Self { num_ops: Self::num_ops(config), } } /// Determine the maximum number of operations that can fit in one gate for the given config. - pub(crate) fn num_ops(config: &CircuitConfig) -> usize { + pub(crate) const fn num_ops(config: &CircuitConfig) -> usize { let wires_per_op = 4 * D; config.num_routed_wires / wires_per_op } - pub fn wires_ith_multiplicand_0(i: usize) -> Range { + pub const fn wires_ith_multiplicand_0(i: usize) -> Range { 4 * D * i..4 * D * i + D } - pub fn wires_ith_multiplicand_1(i: usize) -> Range { + pub const fn wires_ith_multiplicand_1(i: usize) -> Range { 4 * D * i + D..4 * D * i + 2 * D } - pub fn wires_ith_addend(i: usize) -> Range { + pub const fn wires_ith_addend(i: usize) -> Range { 4 * D * i + 2 * D..4 * D * i + 3 * D } - pub fn wires_ith_output(i: usize) -> Range { + pub const fn wires_ith_output(i: usize) -> Range { 4 * D * i + 3 * D..4 * D * i + 4 * D } } diff --git a/plonky2/src/gates/base_sum.rs b/plonky2/src/gates/base_sum.rs index 181252a2d3..1d0f8f809e 100644 --- a/plonky2/src/gates/base_sum.rs +++ b/plonky2/src/gates/base_sum.rs @@ -31,7 +31,7 @@ pub struct BaseSumGate { } impl BaseSumGate { - pub fn new(num_limbs: usize) -> Self { + pub const fn new(num_limbs: usize) -> Self { Self { num_limbs } } @@ -45,7 +45,7 @@ 
impl BaseSumGate { pub const START_LIMBS: usize = 1; /// Returns the index of the `i`th limb wire. - pub fn limbs(&self) -> Range { + pub const fn limbs(&self) -> Range { Self::START_LIMBS..Self::START_LIMBS + self.num_limbs } } @@ -179,7 +179,7 @@ impl, const B: usize, const D: usize> SimpleGenerat for BaseSplitGenerator { fn id(&self) -> String { - "BaseSplitGenerator".to_string() + format!("BaseSplitGenerator + Base: {B}") } fn dependencies(&self) -> Vec { diff --git a/plonky2/src/gates/constant.rs b/plonky2/src/gates/constant.rs index 965b30b6fb..144e1ca352 100644 --- a/plonky2/src/gates/constant.rs +++ b/plonky2/src/gates/constant.rs @@ -27,7 +27,7 @@ pub struct ConstantGate { } impl ConstantGate { - pub fn new(num_consts: usize) -> Self { + pub const fn new(num_consts: usize) -> Self { Self { num_consts } } diff --git a/plonky2/src/gates/coset_interpolation.rs b/plonky2/src/gates/coset_interpolation.rs index c701b8cf7d..ab69f698be 100644 --- a/plonky2/src/gates/coset_interpolation.rs +++ b/plonky2/src/gates/coset_interpolation.rs @@ -1,4 +1,4 @@ -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use alloc::{format, vec}; use core::marker::PhantomData; @@ -29,23 +29,26 @@ use crate::util::serialization::{Buffer, IoResult, Read, Write}; /// - the values that the interpolated polynomial takes on the coset /// - the evaluation point /// -/// The evaluation strategy is based on the observation that if P(X) is the interpolant of some -/// values over a coset and P'(X) is the interpolant of those values over the subgroup, then -/// P(X) = P'(X `shift`^{-1}). Interpolating P'(X) is preferable because when subgroup is fixed +/// The evaluation strategy is based on the observation that if $P(X)$ is the interpolant of some +/// values over a coset and $P'(X)$ is the interpolant of those values over the subgroup, then +/// $P(X) = P'(X \cdot \mathrm{shift}^{-1})$. 
Interpolating $P'(X)$ is preferable because when subgroup is fixed /// then so are the Barycentric weights and both can be hardcoded into the constraint polynomials. /// /// A full interpolation of N values corresponds to the evaluation of a degree-N polynomial. This /// gate can however be configured with a bounded degree of at least 2 by introducing more -/// non-routed wires. Let x[] be the domain points, v[] be the values, w[] be the Barycentric -/// weights and z be the evaluation point. Define the sequences +/// non-routed wires. Let $x[]$ be the domain points, $v[]$ be the values, $w[]$ be the Barycentric +/// weights and $z$ be the evaluation point. Define the sequences /// -/// p[0] = 1 -/// p[i] = p[i - 1] * (z - x[i - 1]) -/// e[0] = 0, -/// e[i] = e[i - 1] * (z - x[i - 1]) + w[i - 1] * v[i - 1] * p[i - 1] +/// $p[0] = 1,$ /// -/// Then e[N] is the final interpolated value. The non-routed wires hold every (d - 1)'th -/// intermediate value of p and e, starting at p[d] and e[d], where d is the gate degree. +/// $p[i] = p[i - 1] \cdot (z - x[i - 1]),$ +/// +/// $e[0] = 0,$ +/// +/// $e[i] = e[i - 1] \cdot (z - x[i - 1]) + w[i - 1] \cdot v[i - 1] \cdot p[i - 1]$ +/// +/// Then $e[N]$ is the final interpolated value. The non-routed wires hold every $(d - 1)$'th +/// intermediate value of $p$ and $e$, starting at $p[d]$ and $e[d]$, where $d$ is the gate degree. #[derive(Clone, Debug, Default)] pub struct CosetInterpolationGate, const D: usize> { pub subgroup_bits: usize, @@ -86,16 +89,16 @@ impl, const D: usize> CosetInterpolationGate } } - fn num_points(&self) -> usize { + const fn num_points(&self) -> usize { 1 << self.subgroup_bits } /// Wire index of the coset shift.
- pub(crate) fn wire_shift(&self) -> usize { + pub(crate) const fn wire_shift(&self) -> usize { 0 } - fn start_values(&self) -> usize { + const fn start_values(&self) -> usize { 1 } @@ -106,31 +109,31 @@ impl, const D: usize> CosetInterpolationGate start..start + D } - fn start_evaluation_point(&self) -> usize { + const fn start_evaluation_point(&self) -> usize { self.start_values() + self.num_points() * D } /// Wire indices of the point to evaluate the interpolant at. - pub(crate) fn wires_evaluation_point(&self) -> Range { + pub(crate) const fn wires_evaluation_point(&self) -> Range { let start = self.start_evaluation_point(); start..start + D } - fn start_evaluation_value(&self) -> usize { + const fn start_evaluation_value(&self) -> usize { self.start_evaluation_point() + D } /// Wire indices of the interpolated value. - pub(crate) fn wires_evaluation_value(&self) -> Range { + pub(crate) const fn wires_evaluation_value(&self) -> Range { let start = self.start_evaluation_value(); start..start + D } - fn start_intermediates(&self) -> usize { + const fn start_intermediates(&self) -> usize { self.start_evaluation_value() + D } - pub fn num_routed_wires(&self) -> usize { + pub const fn num_routed_wires(&self) -> usize { self.start_intermediates() } @@ -631,8 +634,6 @@ fn partial_interpolate_ext_algebra_target, const D: #[cfg(test)] mod tests { - use core::iter::repeat_with; - use anyhow::Result; use plonky2_field::polynomial::PolynomialValues; use plonky2_util::log2_strict; @@ -832,7 +833,7 @@ mod tests { // Get a working row for InterpolationGate. 
let shift = F::rand(); - let values = PolynomialValues::new(repeat_with(FF::rand).take(4).collect()); + let values = PolynomialValues::new(core::iter::repeat_with(FF::rand).take(4).collect()); let eval_point = FF::rand(); let gate = CosetInterpolationGate::::with_max_degree(2, 3); let vars = EvaluationVars { diff --git a/plonky2/src/gates/exponentiation.rs b/plonky2/src/gates/exponentiation.rs index 521520e86a..0011f01143 100644 --- a/plonky2/src/gates/exponentiation.rs +++ b/plonky2/src/gates/exponentiation.rs @@ -1,4 +1,4 @@ -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use alloc::{format, vec}; use core::marker::PhantomData; @@ -32,7 +32,7 @@ pub struct ExponentiationGate, const D: usize> { } impl, const D: usize> ExponentiationGate { - pub fn new(num_power_bits: usize) -> Self { + pub const fn new(num_power_bits: usize) -> Self { Self { num_power_bits, _phantom: PhantomData, @@ -51,7 +51,7 @@ impl, const D: usize> ExponentiationGate { max_for_routed_wires.min(max_for_wires) } - pub fn wire_base(&self) -> usize { + pub const fn wire_base(&self) -> usize { 0 } @@ -61,7 +61,7 @@ impl, const D: usize> ExponentiationGate { 1 + i } - pub fn wire_output(&self) -> usize { + pub const fn wire_output(&self) -> usize { 1 + self.num_power_bits } diff --git a/plonky2/src/gates/gate.rs b/plonky2/src/gates/gate.rs index 2f8f7df16a..cc8f7513c4 100644 --- a/plonky2/src/gates/gate.rs +++ b/plonky2/src/gates/gate.rs @@ -26,15 +26,46 @@ use crate::plonk::vars::{ use crate::util::serialization::{Buffer, IoResult}; /// A custom gate. +/// +/// Vanilla Plonk arithmetization only supports basic fan-in 2 / fan-out 1 arithmetic gates, +/// each of the form +/// +/// $$ a.b \cdot q_M + a \cdot q_L + b \cdot q_R + c \cdot q_O + q_C = 0 $$ +/// +/// where: +/// - $q_M$, $q_L$, $q_R$ and $q_O$ are boolean selectors, +/// - $a$, $b$ and $c$ are values used as inputs and output respectively, +/// - $q_C$ is a constant (possibly 0). 
+/// +/// This allows expressing simple operations like multiplication, addition, etc. For +/// instance, to define a multiplication, one can set $q_M=1$, $q_L=q_R=0$, $q_O = -1$ and $q_C = 0$. +/// +/// Hence, the gate equation simplifies to $a.b - c = 0$, or equivalently to $a.b = c$. +/// +/// However, such a gate is fairly limited for more complex computations. Hence, when a computation may +/// require too many of these "vanilla" gates, or when a computation arises often within the same circuit, +/// one may want to construct a tailored custom gate. These custom gates can use more selectors and are +/// not necessarily limited to 2 inputs + 1 output = 3 wires. +/// For instance, plonky2 supports natively a custom Poseidon hash gate that uses 135 wires. +/// +/// Note however that extending the number of wires necessary for a custom gate comes at a price, and may +/// impact the overall performances when generating proofs for a circuit containing them. pub trait Gate, const D: usize>: 'static + Send + Sync { + /// Defines a unique identifier for this custom gate. + /// + /// This is used as differentiating tag in gate serializers. fn id(&self) -> String; + /// Serializes this custom gate to the targeted byte buffer, with the provided [`CommonCircuitData`]. fn serialize(&self, dst: &mut Vec, common_data: &CommonCircuitData) -> IoResult<()>; + /// Deserializes the bytes in the provided buffer into this custom gate, given some [`CommonCircuitData`]. fn deserialize(src: &mut Buffer, common_data: &CommonCircuitData) -> IoResult where Self: Sized; + /// Defines and evaluates the constraints that enforce the statement represented by this gate. + /// Constraints must be defined in the extension of this custom gate base field. fn eval_unfiltered(&self, vars: EvaluationVars) -> Vec; /// Like `eval_unfiltered`, but specialized for points in the base field. 
@@ -88,6 +119,12 @@ pub trait Gate, const D: usize>: 'static + Send + S res } + /// Defines the recursive constraints that enforce the statement represented by this custom gate. + /// This is necessary to recursively verify proofs generated from a circuit containing such gates. + /// + /// **Note**: The order of the recursive constraints output by this method should match exactly the order + /// of the constraints obtained by the non-recursive [`Gate::eval_unfiltered`] method, otherwise the + /// prover won't be able to generate proofs. fn eval_unfiltered_circuit( &self, builder: &mut CircuitBuilder, @@ -175,10 +212,20 @@ pub trait Gate, const D: usize>: 'static + Send + S } /// The generators used to populate the witness. - /// Note: This should return exactly 1 generator per operation in the gate. + /// + /// **Note**: This should return exactly 1 generator per operation in the gate. fn generators(&self, row: usize, local_constants: &[F]) -> Vec>; /// The number of wires used by this gate. + /// + /// While vanilla Plonk can only evaluate one addition/multiplication at a time, a wider + /// configuration may be able to accommodate several identical gates at once. This is + /// particularly helpful for tiny custom gates that are being used extensively in circuits. + /// + /// For instance, the [crate::gates::multiplication_extension::MulExtensionGate] takes `3*D` + /// wires per multiplication (where `D` is the degree of the extension), hence for a usual + /// configuration of 80 routed wires with D=2, one can evaluate 13 multiplications within a + /// single gate. fn num_wires(&self) -> usize; /// The number of constants used by this gate. @@ -187,6 +234,7 @@ pub trait Gate, const D: usize>: 'static + Send + S /// The maximum degree among this gate's constraint polynomials. fn degree(&self) -> usize; + /// The number of constraints defined by this sole custom gate. fn num_constraints(&self) -> usize; /// Number of operations performed by the gate.
diff --git a/plonky2/src/gates/lookup.rs b/plonky2/src/gates/lookup.rs index f682be23f2..42b3bb92fb 100644 --- a/plonky2/src/gates/lookup.rs +++ b/plonky2/src/gates/lookup.rs @@ -1,6 +1,6 @@ -use alloc::format; -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; +use alloc::{format, vec}; use core::usize; use itertools::Itertools; @@ -51,16 +51,16 @@ impl LookupGate { lut_hash: keccak(table_bytes).0, } } - pub(crate) fn num_slots(config: &CircuitConfig) -> usize { + pub(crate) const fn num_slots(config: &CircuitConfig) -> usize { let wires_per_lookup = 2; config.num_routed_wires / wires_per_lookup } - pub fn wire_ith_looking_inp(i: usize) -> usize { + pub const fn wire_ith_looking_inp(i: usize) -> usize { 2 * i } - pub fn wire_ith_looking_out(i: usize) -> usize { + pub const fn wire_ith_looking_out(i: usize) -> usize { 2 * i + 1 } } diff --git a/plonky2/src/gates/lookup_table.rs b/plonky2/src/gates/lookup_table.rs index 9f9d967ea0..ad01e09209 100644 --- a/plonky2/src/gates/lookup_table.rs +++ b/plonky2/src/gates/lookup_table.rs @@ -1,7 +1,7 @@ -use alloc::format; -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::sync::Arc; use alloc::vec::Vec; +use alloc::{format, vec}; use core::usize; use itertools::Itertools; @@ -56,23 +56,23 @@ impl LookupTableGate { } } - pub(crate) fn num_slots(config: &CircuitConfig) -> usize { + pub(crate) const fn num_slots(config: &CircuitConfig) -> usize { let wires_per_entry = 3; config.num_routed_wires / wires_per_entry } /// Wire for the looked input. - pub fn wire_ith_looked_inp(i: usize) -> usize { + pub const fn wire_ith_looked_inp(i: usize) -> usize { 3 * i } // Wire for the looked output. - pub fn wire_ith_looked_out(i: usize) -> usize { + pub const fn wire_ith_looked_out(i: usize) -> usize { 3 * i + 1 } /// Wire for the multiplicity. Set after the trace has been generated. 
- pub fn wire_ith_multiplicity(i: usize) -> usize { + pub const fn wire_ith_multiplicity(i: usize) -> usize { 3 * i + 2 } } diff --git a/plonky2/src/gates/mod.rs b/plonky2/src/gates/mod.rs index 432f026470..e349cf7568 100644 --- a/plonky2/src/gates/mod.rs +++ b/plonky2/src/gates/mod.rs @@ -1,3 +1,26 @@ +//! plonky2 custom gates. +//! +//! Vanilla Plonk arithmetization only supports basic fan-in 2 / fan-out 1 arithmetic gates, +//! each of the form +//! +//! $$ a.b.q_M + a.q_L + b.q_R + c.q_O + q_C = 0 $$ +//! +//! where: +//! - $q_M$, $q_L$, $q_R$ and $q_O$ are boolean selectors, +//! - $a$, $b$ and $c$ are values used as inputs and output respectively, +//! - $q_C$ is a constant (possibly 0). +//! +//! This allows expressing simple operations like multiplication, addition, etc. For +//! instance, to define a multiplication, one can set $q_M=1$, $q_L=q_R=0$, $q_O = -1$ and $q_C = 0$. +//! +//! Hence, the gate equation simplifies to $a.b - c = 0$, or equivalently to $a.b = c$. +//! +//! However, such a gate is fairly limited for more complex computations. Hence, when a computation may +//! require too many of these "vanilla" gates, or when a computation arises often within the same circuit, +//! one may want to construct a tailored custom gate. These custom gates can use more selectors and are +//! not necessarily limited to 2 inputs + 1 output = 3 wires. +//! For instance, plonky2 supports natively a custom Poseidon hash gate that uses 135 wires. + // Gates have `new` methods that return `GateRef`s. 
pub mod arithmetic_base; diff --git a/plonky2/src/gates/multiplication_extension.rs b/plonky2/src/gates/multiplication_extension.rs index 8f6b27db60..3f9fd8fe53 100644 --- a/plonky2/src/gates/multiplication_extension.rs +++ b/plonky2/src/gates/multiplication_extension.rs @@ -1,5 +1,5 @@ use alloc::format; -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use core::ops::Range; @@ -16,8 +16,8 @@ use crate::plonk::circuit_data::{CircuitConfig, CommonCircuitData}; use crate::plonk::vars::{EvaluationTargets, EvaluationVars, EvaluationVarsBase}; use crate::util::serialization::{Buffer, IoResult, Read, Write}; -/// A gate which can perform a weighted multiplication, i.e. `result = c0 x y`. If the config -/// supports enough routed wires, it can support several such operations in one gate. +/// A gate which can perform a weighted multiplication, i.e. `result = c0.x.y` on [`ExtensionTarget`]. +/// If the config has enough routed wires, it can support several such operations in one gate. #[derive(Debug, Clone)] pub struct MulExtensionGate { /// Number of multiplications performed by the gate. @@ -25,25 +25,25 @@ pub struct MulExtensionGate { } impl MulExtensionGate { - pub fn new_from_config(config: &CircuitConfig) -> Self { + pub const fn new_from_config(config: &CircuitConfig) -> Self { Self { num_ops: Self::num_ops(config), } } /// Determine the maximum number of operations that can fit in one gate for the given config. 
- pub(crate) fn num_ops(config: &CircuitConfig) -> usize { + pub(crate) const fn num_ops(config: &CircuitConfig) -> usize { let wires_per_op = 3 * D; config.num_routed_wires / wires_per_op } - pub fn wires_ith_multiplicand_0(i: usize) -> Range { + pub const fn wires_ith_multiplicand_0(i: usize) -> Range { 3 * D * i..3 * D * i + D } - pub fn wires_ith_multiplicand_1(i: usize) -> Range { + pub const fn wires_ith_multiplicand_1(i: usize) -> Range { 3 * D * i + D..3 * D * i + 2 * D } - pub fn wires_ith_output(i: usize) -> Range { + pub const fn wires_ith_output(i: usize) -> Range { 3 * D * i + 2 * D..3 * D * i + 3 * D } } diff --git a/plonky2/src/gates/poseidon.rs b/plonky2/src/gates/poseidon.rs index f6d0657260..3ba1b67b4e 100644 --- a/plonky2/src/gates/poseidon.rs +++ b/plonky2/src/gates/poseidon.rs @@ -1,4 +1,4 @@ -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use alloc::{format, vec}; use core::marker::PhantomData; @@ -30,17 +30,17 @@ use crate::util::serialization::{Buffer, IoResult, Read, Write}; pub struct PoseidonGate, const D: usize>(PhantomData); impl, const D: usize> PoseidonGate { - pub fn new() -> Self { + pub const fn new() -> Self { Self(PhantomData) } /// The wire index for the `i`th input to the permutation. - pub fn wire_input(i: usize) -> usize { + pub const fn wire_input(i: usize) -> usize { i } /// The wire index for the `i`th output to the permutation. - pub fn wire_output(i: usize) -> usize { + pub const fn wire_output(i: usize) -> usize { SPONGE_WIDTH + i } @@ -90,7 +90,7 @@ impl, const D: usize> PoseidonGate { } /// End of wire indices, exclusive. 
- fn end() -> usize { + const fn end() -> usize { Self::START_FULL_1 + SPONGE_WIDTH * poseidon::HALF_N_FULL_ROUNDS } } @@ -532,6 +532,9 @@ impl + Poseidon, const D: usize> SimpleGenerator + Poseidon, const D: usize>(PhantomData); impl + Poseidon, const D: usize> PoseidonMdsGate { - pub fn new() -> Self { + pub const fn new() -> Self { Self(PhantomData) } diff --git a/plonky2/src/gates/public_input.rs b/plonky2/src/gates/public_input.rs index f770e2e6e1..8be41d4015 100644 --- a/plonky2/src/gates/public_input.rs +++ b/plonky2/src/gates/public_input.rs @@ -22,7 +22,7 @@ use crate::util::serialization::{Buffer, IoResult}; pub struct PublicInputGate; impl PublicInputGate { - pub fn wires_public_inputs_hash() -> Range { + pub const fn wires_public_inputs_hash() -> Range { 0..4 } } diff --git a/plonky2/src/gates/random_access.rs b/plonky2/src/gates/random_access.rs index 9110a59b67..59af01a50d 100644 --- a/plonky2/src/gates/random_access.rs +++ b/plonky2/src/gates/random_access.rs @@ -1,4 +1,4 @@ -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use alloc::{format, vec}; use core::marker::PhantomData; @@ -41,7 +41,7 @@ pub struct RandomAccessGate, const D: usize> { } impl, const D: usize> RandomAccessGate { - fn new(num_copies: usize, bits: usize, num_extra_constants: usize) -> Self { + const fn new(num_copies: usize, bits: usize, num_extra_constants: usize) -> Self { Self { bits, num_copies, @@ -71,7 +71,7 @@ impl, const D: usize> RandomAccessGate { } /// Length of the list being accessed. - fn vec_size(&self) -> usize { + const fn vec_size(&self) -> usize { 1 << self.bits } @@ -94,7 +94,7 @@ impl, const D: usize> RandomAccessGate { (2 + self.vec_size()) * copy + 2 + i } - fn start_extra_constants(&self) -> usize { + const fn start_extra_constants(&self) -> usize { (2 + self.vec_size()) * self.num_copies } @@ -104,7 +104,7 @@ impl, const D: usize> RandomAccessGate { } /// All above wires are routed. 
- pub fn num_routed_wires(&self) -> usize { + pub const fn num_routed_wires(&self) -> usize { self.start_extra_constants() + self.num_extra_constants } diff --git a/plonky2/src/gates/reducing.rs b/plonky2/src/gates/reducing.rs index b313efe695..c9daf5382b 100644 --- a/plonky2/src/gates/reducing.rs +++ b/plonky2/src/gates/reducing.rs @@ -1,4 +1,4 @@ -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use alloc::{format, vec}; use core::ops::Range; @@ -23,7 +23,7 @@ pub struct ReducingGate { } impl ReducingGate { - pub fn new(num_coeffs: usize) -> Self { + pub const fn new(num_coeffs: usize) -> Self { Self { num_coeffs } } @@ -31,23 +31,23 @@ impl ReducingGate { (num_routed_wires - 3 * D).min((num_wires - 2 * D) / (D + 1)) } - pub fn wires_output() -> Range { + pub const fn wires_output() -> Range { 0..D } - pub fn wires_alpha() -> Range { + pub const fn wires_alpha() -> Range { D..2 * D } - pub fn wires_old_acc() -> Range { + pub const fn wires_old_acc() -> Range { 2 * D..3 * D } const START_COEFFS: usize = 3 * D; - pub fn wires_coeffs(&self) -> Range { + pub const fn wires_coeffs(&self) -> Range { Self::START_COEFFS..Self::START_COEFFS + self.num_coeffs } - fn start_accs(&self) -> usize { + const fn start_accs(&self) -> usize { Self::START_COEFFS + self.num_coeffs } - fn wires_accs(&self, i: usize) -> Range { + const fn wires_accs(&self, i: usize) -> Range { if i == self.num_coeffs - 1 { // The last accumulator is the output. 
return Self::wires_output(); diff --git a/plonky2/src/gates/reducing_extension.rs b/plonky2/src/gates/reducing_extension.rs index 5492c50611..b1dc5e8538 100644 --- a/plonky2/src/gates/reducing_extension.rs +++ b/plonky2/src/gates/reducing_extension.rs @@ -1,4 +1,4 @@ -use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use alloc::{format, vec}; use core::ops::Range; @@ -23,7 +23,7 @@ pub struct ReducingExtensionGate { } impl ReducingExtensionGate { - pub fn new(num_coeffs: usize) -> Self { + pub const fn new(num_coeffs: usize) -> Self { Self { num_coeffs } } @@ -33,20 +33,20 @@ impl ReducingExtensionGate { ((num_routed_wires - 3 * D) / D).min((num_wires - 2 * D) / (D * 2)) } - pub fn wires_output() -> Range { + pub const fn wires_output() -> Range { 0..D } - pub fn wires_alpha() -> Range { + pub const fn wires_alpha() -> Range { D..2 * D } - pub fn wires_old_acc() -> Range { + pub const fn wires_old_acc() -> Range { 2 * D..3 * D } const START_COEFFS: usize = 3 * D; - pub fn wires_coeff(i: usize) -> Range { + pub const fn wires_coeff(i: usize) -> Range { Self::START_COEFFS + i * D..Self::START_COEFFS + (i + 1) * D } - fn start_accs(&self) -> usize { + const fn start_accs(&self) -> usize { Self::START_COEFFS + self.num_coeffs * D } fn wires_accs(&self, i: usize) -> Range { diff --git a/plonky2/src/gates/selectors.rs b/plonky2/src/gates/selectors.rs index 1018ba755b..be9a9da84a 100644 --- a/plonky2/src/gates/selectors.rs +++ b/plonky2/src/gates/selectors.rs @@ -40,7 +40,7 @@ pub enum LookupSelectors { } /// Returns selector polynomials for each LUT. We have two constraint domains (remember that gates are stored upside down): -/// - [last_lut_row, first_lut_row] (Sum and RE transition contraints), +/// - [last_lut_row, first_lut_row] (Sum and RE transition constraints), /// - [last_lu_row, last_lut_row - 1] (LDC column transition constraints). 
/// We also add two more: /// - {first_lut_row + 1} where we check the initial values of sum and RE (which are 0), diff --git a/plonky2/src/hash/arch/aarch64/poseidon_goldilocks_neon.rs b/plonky2/src/hash/arch/aarch64/poseidon_goldilocks_neon.rs index 10d81f280b..4b1d8dfb8d 100644 --- a/plonky2/src/hash/arch/aarch64/poseidon_goldilocks_neon.rs +++ b/plonky2/src/hash/arch/aarch64/poseidon_goldilocks_neon.rs @@ -89,7 +89,7 @@ unsafe fn add_with_wraparound(a: u64, b: u64) -> u64 { adj = lateout(reg) adj, options(pure, nomem, nostack), ); - res + adj // adj is EPSILON if wraparound occured and 0 otherwise + res + adj // adj is EPSILON if wraparound occurred and 0 otherwise } /// Subtraction of a and (b >> 32) modulo ORDER accounting for wraparound. @@ -152,7 +152,7 @@ unsafe fn multiply(x: u64, y: u64) -> u64 { // ==================================== STANDALONE CONST LAYER ===================================== /// Standalone const layer. Run only once, at the start of round 1. Remaining const layers are fused -/// with the preceeding MDS matrix multiplication. +/// with the preceding MDS matrix multiplication. /* #[inline(always)] #[unroll_for_loops] diff --git a/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx2_bmi2.rs b/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx2_bmi2.rs index c7a65f9016..e56fea5ea7 100644 --- a/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx2_bmi2.rs +++ b/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx2_bmi2.rs @@ -18,7 +18,7 @@ use crate::util::branch_hint; const WIDTH: usize = 12; -// These tranformed round constants are used where the constant layer is fused with the preceeding +// These transformed round constants are used where the constant layer is fused with the preceding // MDS layer. The FUSED_ROUND_CONSTANTS for round i are the ALL_ROUND_CONSTANTS for round i + 1. // The FUSED_ROUND_CONSTANTS for the very last round are 0, as it is not followed by a constant // layer. 
On top of that, all FUSED_ROUND_CONSTANTS are shifted by 2 ** 63 to save a few XORs per @@ -183,10 +183,10 @@ unsafe fn const_layer( // occur if all round constants are < 0xffffffff00000001 = ORDER: if the high bits are // 0xffffffff, then the low bits are 0, so the carry bit cannot occur. So this trick is valid // as long as all the round constants are in canonical form. - // The mask contains 0xffffffff in the high doubleword if wraparound occured and 0 otherwise. + // The mask contains 0xffffffff in the high doubleword if wraparound occurred and 0 otherwise. // We will ignore the low doubleword. let wraparound_mask = map3!(_mm256_cmpgt_epi32, state_s, res_maybe_wrapped_s); - // wraparound_adjustment contains 0xffffffff = EPSILON if wraparound occured and 0 otherwise. + // wraparound_adjustment contains 0xffffffff = EPSILON if wraparound occurred and 0 otherwise. let wraparound_adjustment = map3!(_mm256_srli_epi64::<32>, wraparound_mask); // XOR commutes with the addition below. Placing it here helps mask latency. let res_maybe_wrapped = map3!(_mm256_xor_si256, res_maybe_wrapped_s, rep sign_bit); @@ -939,7 +939,7 @@ pub unsafe fn poseidon(state: &[GoldilocksField; 12]) -> [GoldilocksField; 12] { let state = load_state(state); // The first constant layer must be done explicitly. The remaining constant layers are fused - // with the preceeding MDS layer. + // with the preceding MDS layer. 
let state = const_layer(state, &ALL_ROUND_CONSTANTS[0..WIDTH].try_into().unwrap()); let state = half_full_rounds(state, 0); diff --git a/plonky2/src/hash/hashing.rs b/plonky2/src/hash/hashing.rs index 28d3b89f28..f5fe1f1ef6 100644 --- a/plonky2/src/hash/hashing.rs +++ b/plonky2/src/hash/hashing.rs @@ -2,7 +2,6 @@ use alloc::vec::Vec; use core::fmt::Debug; -use std::iter::repeat; use crate::field::extension::Extendable; use crate::field::types::Field; @@ -34,7 +33,7 @@ impl, const D: usize> CircuitBuilder { num_outputs: usize, ) -> Vec { let zero = self.zero(); - let mut state = H::AlgebraicPermutation::new(std::iter::repeat(zero)); + let mut state = H::AlgebraicPermutation::new(core::iter::repeat(zero)); // Absorb all input chunks. for input_chunk in inputs.chunks(H::AlgebraicPermutation::RATE) { @@ -71,7 +70,7 @@ pub trait PlonkyPermutation: /// received; remaining state (if any) initialised with /// `T::default()`. To initialise remaining elements with a /// different value, instead of your original `iter` pass - /// `iter.chain(std::iter::repeat(F::from_canonical_u64(12345)))` + /// `iter.chain(core::iter::repeat(F::from_canonical_u64(12345)))` /// or similar. fn new>(iter: I) -> Self; @@ -103,7 +102,7 @@ pub fn compress>(x: HashOut, y: HashOut) debug_assert_eq!(y.elements.len(), NUM_HASH_OUT_ELTS); debug_assert!(P::RATE >= NUM_HASH_OUT_ELTS); - let mut perm = P::new(repeat(F::ZERO)); + let mut perm = P::new(core::iter::repeat(F::ZERO)); perm.set_from_slice(&x.elements, 0); perm.set_from_slice(&y.elements, NUM_HASH_OUT_ELTS); @@ -120,7 +119,7 @@ pub fn hash_n_to_m_no_pad>( inputs: &[F], num_outputs: usize, ) -> Vec { - let mut perm = P::new(repeat(F::ZERO)); + let mut perm = P::new(core::iter::repeat(F::ZERO)); // Absorb all input chunks. 
for input_chunk in inputs.chunks(P::RATE) { diff --git a/plonky2/src/hash/keccak.rs b/plonky2/src/hash/keccak.rs index 43b02db42c..281220f309 100644 --- a/plonky2/src/hash/keccak.rs +++ b/plonky2/src/hash/keccak.rs @@ -1,6 +1,5 @@ use alloc::vec; use alloc::vec::Vec; -use core::iter; use core::mem::size_of; use itertools::Itertools; @@ -68,7 +67,7 @@ impl PlonkyPermutation for KeccakPermutation { .copy_from_slice(&self.state[i].to_canonical_u64().to_le_bytes()); } - let hash_onion = iter::repeat_with(|| { + let hash_onion = core::iter::repeat_with(|| { let output = keccak(state_bytes.clone()).to_fixed_bytes(); state_bytes = output.to_vec(); output diff --git a/plonky2/src/hash/merkle_proofs.rs b/plonky2/src/hash/merkle_proofs.rs index 14eb3a1cb3..c848f66ed0 100644 --- a/plonky2/src/hash/merkle_proofs.rs +++ b/plonky2/src/hash/merkle_proofs.rs @@ -132,7 +132,7 @@ impl, const D: usize> CircuitBuilder { perm_inputs.set_from_slice(&state.elements, 0); perm_inputs.set_from_slice(&sibling.elements, NUM_HASH_OUT_ELTS); // Ensure the rest of the state, if any, is zero: - perm_inputs.set_from_iter(std::iter::repeat(zero), 2 * NUM_HASH_OUT_ELTS); + perm_inputs.set_from_iter(core::iter::repeat(zero), 2 * NUM_HASH_OUT_ELTS); let perm_outs = self.permute_swapped::(perm_inputs, bit); let hash_outs = perm_outs.squeeze()[0..NUM_HASH_OUT_ELTS] .try_into() diff --git a/plonky2/src/hash/mod.rs b/plonky2/src/hash/mod.rs index b829392063..c98c57069c 100644 --- a/plonky2/src/hash/mod.rs +++ b/plonky2/src/hash/mod.rs @@ -1,3 +1,6 @@ +//! plonky2 hashing logic for in-circuit hashing and Merkle proof verification +//! as well as specific hash functions implementation. 
+ mod arch; pub mod hash_types; pub mod hashing; diff --git a/plonky2/src/hash/path_compression.rs b/plonky2/src/hash/path_compression.rs index d4f7d5eb39..bc2f23b055 100644 --- a/plonky2/src/hash/path_compression.rs +++ b/plonky2/src/hash/path_compression.rs @@ -148,12 +148,15 @@ mod tests { assert_eq!(proofs, decompressed_proofs); - let compressed_proof_bytes = serde_cbor::to_vec(&compressed_proofs).unwrap(); - println!( - "Compressed proof length: {} bytes", - compressed_proof_bytes.len() - ); - let proof_bytes = serde_cbor::to_vec(&proofs).unwrap(); - println!("Proof length: {} bytes", proof_bytes.len()); + #[cfg(feature = "std")] + { + let compressed_proof_bytes = serde_cbor::to_vec(&compressed_proofs).unwrap(); + println!( + "Compressed proof length: {} bytes", + compressed_proof_bytes.len() + ); + let proof_bytes = serde_cbor::to_vec(&proofs).unwrap(); + println!("Proof length: {} bytes", proof_bytes.len()); + } } } diff --git a/plonky2/src/hash/poseidon.rs b/plonky2/src/hash/poseidon.rs index a89deda705..2d357b403a 100644 --- a/plonky2/src/hash/poseidon.rs +++ b/plonky2/src/hash/poseidon.rs @@ -3,7 +3,7 @@ use alloc::vec; use alloc::vec::Vec; -use std::fmt::Debug; +use core::fmt::Debug; use unroll::unroll_for_loops; @@ -36,7 +36,7 @@ pub const N_ROUNDS: usize = N_FULL_ROUNDS_TOTAL + N_PARTIAL_ROUNDS; const MAX_WIDTH: usize = 12; // we only have width 8 and 12, and 12 is bigger. 
:) #[inline(always)] -fn add_u160_u128((x_lo, x_hi): (u128, u32), y: u128) -> (u128, u32) { +const fn add_u160_u128((x_lo, x_hi): (u128, u32), y: u128) -> (u128, u32) { let (res_lo, over) = x_lo.overflowing_add(y); let res_hi = x_hi + (over as u32); (res_lo, res_hi) @@ -753,6 +753,9 @@ impl AlgebraicHasher for PoseidonHash { #[cfg(test)] pub(crate) mod test_helpers { + #[cfg(not(feature = "std"))] + use alloc::vec::Vec; + use crate::field::types::Field; use crate::hash::poseidon::{Poseidon, SPONGE_WIDTH}; diff --git a/plonky2/src/hash/poseidon_goldilocks.rs b/plonky2/src/hash/poseidon_goldilocks.rs index e2c72d858f..12d061265e 100644 --- a/plonky2/src/hash/poseidon_goldilocks.rs +++ b/plonky2/src/hash/poseidon_goldilocks.rs @@ -315,7 +315,7 @@ mod poseidon12_mds { /// Split 3 x 4 FFT-based MDS vector-multiplication with the Poseidon circulant MDS matrix. #[inline(always)] - pub(crate) fn mds_multiply_freq(state: [u64; 12]) -> [u64; 12] { + pub(crate) const fn mds_multiply_freq(state: [u64; 12]) -> [u64; 12] { let [s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] = state; let (u0, u1, u2) = fft4_real([s0, s3, s6, s9]); @@ -323,7 +323,7 @@ mod poseidon12_mds { let (u8, u9, u10) = fft4_real([s2, s5, s8, s11]); // This where the multiplication in frequency domain is done. More precisely, and with - // the appropriate permuations in between, the sequence of + // the appropriate permutations in between, the sequence of // 3-point FFTs --> multiplication by twiddle factors --> Hadamard multiplication --> // 3 point iFFTs --> multiplication by (inverse) twiddle factors // is "squashed" into one step composed of the functions "block1", "block2" and "block3". 
@@ -343,7 +343,7 @@ mod poseidon12_mds { } #[inline(always)] - fn block1(x: [i64; 3], y: [i64; 3]) -> [i64; 3] { + const fn block1(x: [i64; 3], y: [i64; 3]) -> [i64; 3] { let [x0, x1, x2] = x; let [y0, y1, y2] = y; let z0 = x0 * y0 + x1 * y2 + x2 * y1; @@ -354,7 +354,7 @@ mod poseidon12_mds { } #[inline(always)] - fn block2(x: [(i64, i64); 3], y: [(i64, i64); 3]) -> [(i64, i64); 3] { + const fn block2(x: [(i64, i64); 3], y: [(i64, i64); 3]) -> [(i64, i64); 3] { let [(x0r, x0i), (x1r, x1i), (x2r, x2i)] = x; let [(y0r, y0i), (y1r, y1i), (y2r, y2i)] = y; let x0s = x0r + x0i; @@ -392,7 +392,7 @@ mod poseidon12_mds { } #[inline(always)] - fn block3(x: [i64; 3], y: [i64; 3]) -> [i64; 3] { + const fn block3(x: [i64; 3], y: [i64; 3]) -> [i64; 3] { let [x0, x1, x2] = x; let [y0, y1, y2] = y; let z0 = x0 * y0 - x1 * y2 - x2 * y1; @@ -404,20 +404,20 @@ mod poseidon12_mds { /// Real 2-FFT over u64 integers. #[inline(always)] - pub(crate) fn fft2_real(x: [u64; 2]) -> [i64; 2] { + pub(crate) const fn fft2_real(x: [u64; 2]) -> [i64; 2] { [(x[0] as i64 + x[1] as i64), (x[0] as i64 - x[1] as i64)] } /// Real 2-iFFT over u64 integers. /// Division by two to complete the inverse FFT is not performed here. #[inline(always)] - pub(crate) fn ifft2_real_unreduced(y: [i64; 2]) -> [u64; 2] { + pub(crate) const fn ifft2_real_unreduced(y: [i64; 2]) -> [u64; 2] { [(y[0] + y[1]) as u64, (y[0] - y[1]) as u64] } /// Real 4-FFT over u64 integers. #[inline(always)] - pub(crate) fn fft4_real(x: [u64; 4]) -> (i64, (i64, i64), i64) { + pub(crate) const fn fft4_real(x: [u64; 4]) -> (i64, (i64, i64), i64) { let [z0, z2] = fft2_real([x[0], x[2]]); let [z1, z3] = fft2_real([x[1], x[3]]); let y0 = z0 + z1; @@ -429,7 +429,7 @@ mod poseidon12_mds { /// Real 4-iFFT over u64 integers. /// Division by four to complete the inverse FFT is not performed here. 
#[inline(always)] - pub(crate) fn ifft4_real_unreduced(y: (i64, (i64, i64), i64)) -> [u64; 4] { + pub(crate) const fn ifft4_real_unreduced(y: (i64, (i64, i64), i64)) -> [u64; 4] { let z0 = y.0 + y.2; let z1 = y.0 - y.2; let z2 = y.1 .0; @@ -444,6 +444,9 @@ mod poseidon12_mds { #[cfg(test)] mod tests { + #[cfg(not(feature = "std"))] + use alloc::{vec, vec::Vec}; + use crate::field::goldilocks_field::GoldilocksField as F; use crate::field::types::{Field, PrimeField64}; use crate::hash::poseidon::test_helpers::{check_consistency, check_test_vectors}; diff --git a/plonky2/src/iop/challenger.rs b/plonky2/src/iop/challenger.rs index 9df49996c5..d7b3c23795 100644 --- a/plonky2/src/iop/challenger.rs +++ b/plonky2/src/iop/challenger.rs @@ -30,7 +30,7 @@ pub struct Challenger> { impl> Challenger { pub fn new() -> Challenger { Challenger { - sponge_state: H::Permutation::new(std::iter::repeat(F::ZERO)), + sponge_state: H::Permutation::new(core::iter::repeat(F::ZERO)), input_buffer: Vec::with_capacity(H::Permutation::RATE), output_buffer: Vec::with_capacity(H::Permutation::RATE), } @@ -175,7 +175,7 @@ impl, H: AlgebraicHasher, const D: usize> pub fn new(builder: &mut CircuitBuilder) -> Self { let zero = builder.zero(); Self { - sponge_state: H::AlgebraicPermutation::new(std::iter::repeat(zero)), + sponge_state: H::AlgebraicPermutation::new(core::iter::repeat(zero)), input_buffer: Vec::new(), output_buffer: Vec::new(), __: PhantomData, @@ -293,6 +293,9 @@ impl, H: AlgebraicHasher, const D: usize> #[cfg(test)] mod tests { + #[cfg(not(feature = "std"))] + use alloc::vec::Vec; + use crate::field::types::Sample; use crate::iop::challenger::{Challenger, RecursiveChallenger}; use crate::iop::generator::generate_partial_witness; diff --git a/plonky2/src/iop/ext_target.rs b/plonky2/src/iop/ext_target.rs index 21eb3e5539..c64d96e872 100644 --- a/plonky2/src/iop/ext_target.rs +++ b/plonky2/src/iop/ext_target.rs @@ -9,6 +9,10 @@ use crate::iop::target::Target; use 
crate::plonk::circuit_builder::CircuitBuilder; /// `Target`s representing an element of an extension field. +/// +/// This is typically used in recursion settings, where the outer circuit must verify +/// a proof satisfying an inner circuit's statement, which is verified using arithmetic +/// in an extension of the base field. #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] pub struct ExtensionTarget(pub [Target; D]); @@ -19,7 +23,7 @@ impl Default for ExtensionTarget { } impl ExtensionTarget { - pub fn to_target_array(&self) -> [Target; D] { + pub const fn to_target_array(&self) -> [Target; D] { self.0 } @@ -77,7 +81,7 @@ impl TryFrom> for ExtensionTarget { pub struct ExtensionAlgebraTarget(pub [ExtensionTarget; D]); impl ExtensionAlgebraTarget { - pub fn to_ext_target_array(&self) -> [ExtensionTarget; D] { + pub const fn to_ext_target_array(&self) -> [ExtensionTarget; D] { self.0 } } diff --git a/plonky2/src/iop/generator.rs b/plonky2/src/iop/generator.rs index 22d0fe0792..1704b34795 100644 --- a/plonky2/src/iop/generator.rs +++ b/plonky2/src/iop/generator.rs @@ -1,3 +1,5 @@ +use alloc::boxed::Box; +use alloc::string::{String, ToString}; use alloc::vec; use alloc::vec::Vec; use core::fmt::Debug; diff --git a/plonky2/src/iop/target.rs b/plonky2/src/iop/target.rs index b5efd89d74..705941e023 100644 --- a/plonky2/src/iop/target.rs +++ b/plonky2/src/iop/target.rs @@ -8,15 +8,25 @@ use crate::iop::wire::Wire; use crate::plonk::circuit_data::CircuitConfig; /// A location in the witness. +/// +/// Targets can either be placed at a specific location, or be "floating" around, +/// serving as intermediary value holders, and copied to other locations whenever needed. +/// +/// When generating a proof for a given circuit, the prover will "set" the values of some +/// (or all) targets, so that they satisfy the circuit constraints. This is done through +/// the [PartialWitness](crate::iop::witness::PartialWitness) interface. 
+/// +/// There are different "variants" of the `Target` type, namely [`ExtensionTarget`], +/// [ExtensionAlgebraTarget](crate::iop::ext_target::ExtensionAlgebraTarget). +/// The `Target` type is the default one for most circuits verifying some simple statement. #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize, Deserialize)] pub enum Target { + /// A target that has a fixed location in the witness (seen as a `degree x num_wires` grid). Wire(Wire), /// A target that doesn't have any inherent location in the witness (but it can be copied to /// another target that does). This is useful for representing intermediate values in witness /// generation. - VirtualTarget { - index: usize, - }, + VirtualTarget { index: usize }, } impl Default for Target { @@ -26,11 +36,11 @@ impl Default for Target { } impl Target { - pub fn wire(row: usize, column: usize) -> Self { + pub const fn wire(row: usize, column: usize) -> Self { Self::Wire(Wire { row, column }) } - pub fn is_routable(&self, config: &CircuitConfig) -> bool { + pub const fn is_routable(&self, config: &CircuitConfig) -> bool { match self { Target::Wire(wire) => wire.is_routable(config), Target::VirtualTarget { .. } => true, @@ -49,7 +59,7 @@ impl Target { } /// Conversion to an `ExtensionTarget`. 
- pub fn to_ext_target(self, zero: Self) -> ExtensionTarget { + pub const fn to_ext_target(self, zero: Self) -> ExtensionTarget { let mut arr = [zero; D]; arr[0] = self; ExtensionTarget(arr) @@ -66,7 +76,7 @@ pub struct BoolTarget { } impl BoolTarget { - pub fn new_unsafe(target: Target) -> BoolTarget { + pub const fn new_unsafe(target: Target) -> BoolTarget { BoolTarget { target, _private: (), diff --git a/plonky2/src/iop/wire.rs b/plonky2/src/iop/wire.rs index 5f8d3b23ab..435479ce7b 100644 --- a/plonky2/src/iop/wire.rs +++ b/plonky2/src/iop/wire.rs @@ -15,7 +15,7 @@ pub struct Wire { } impl Wire { - pub fn is_routable(&self, config: &CircuitConfig) -> bool { + pub const fn is_routable(&self, config: &CircuitConfig) -> bool { self.column < config.num_routed_wires } diff --git a/plonky2/src/iop/witness.rs b/plonky2/src/iop/witness.rs index d5b8dd04da..cf74be512c 100644 --- a/plonky2/src/iop/witness.rs +++ b/plonky2/src/iop/witness.rs @@ -297,7 +297,7 @@ impl Witness for PartialWitness { /// `PartitionWitness` holds a disjoint-set forest of the targets respecting a circuit's copy constraints. /// The value of a target is defined to be the value of its root in the forest. -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct PartitionWitness<'a, F: Field> { pub values: Vec>, pub representative_map: &'a [usize], diff --git a/plonky2/src/lib.rs b/plonky2/src/lib.rs index c2913023f5..44bc2cf638 100644 --- a/plonky2/src/lib.rs +++ b/plonky2/src/lib.rs @@ -2,8 +2,9 @@ #![allow(clippy::needless_range_loop)] #![cfg_attr(not(feature = "std"), no_std)] -extern crate alloc; +pub extern crate alloc; +/// Re-export of `plonky2_field`. 
#[doc(inline)] pub use plonky2_field as field; diff --git a/plonky2/src/lookup_test.rs b/plonky2/src/lookup_test.rs index af85decaeb..cb6b53f86b 100644 --- a/plonky2/src/lookup_test.rs +++ b/plonky2/src/lookup_test.rs @@ -1,28 +1,34 @@ -static LOGGER_INITIALIZED: Once = Once::new(); - -use alloc::sync::Arc; -use std::sync::Once; +#[cfg(not(feature = "std"))] +use alloc::{sync::Arc, vec, vec::Vec}; +#[cfg(feature = "std")] +use std::sync::{Arc, Once}; use itertools::Itertools; -use log::{Level, LevelFilter}; +use log::Level; +use crate::field::types::Field; use crate::gadgets::lookup::{OTHER_TABLE, SMALLER_TABLE, TIP5_TABLE}; use crate::gates::lookup_table::LookupTable; use crate::gates::noop::NoopGate; +use crate::iop::witness::{PartialWitness, WitnessWrite}; +use crate::plonk::circuit_builder::CircuitBuilder; +use crate::plonk::circuit_data::CircuitConfig; +use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; use crate::plonk::prover::prove; use crate::util::timing::TimingTree; +const D: usize = 2; +type C = PoseidonGoldilocksConfig; +type F = >::F; + +const LUT_SIZE: usize = u16::MAX as usize + 1; + +#[cfg(feature = "std")] +static LOGGER_INITIALIZED: Once = Once::new(); + #[test] fn test_no_lookup() -> anyhow::Result<()> { - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); - use crate::iop::witness::PartialWitness; - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; + init_logger(); let config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); @@ -41,14 +47,7 @@ fn test_no_lookup() -> anyhow::Result<()> { #[should_panic] #[test] fn test_lookup_table_not_used() { - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); - use crate::plonk::circuit_builder::CircuitBuilder; - use 
crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; + init_logger(); let config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); @@ -63,14 +62,7 @@ fn test_lookup_table_not_used() { #[should_panic] #[test] fn test_lookup_without_table() { - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; + init_logger(); let config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); @@ -84,17 +76,8 @@ fn test_lookup_without_table() { // Tests two lookups in one lookup table. #[test] fn test_one_lookup() -> anyhow::Result<()> { - use crate::field::types::Field; - use crate::iop::witness::{PartialWitness, WitnessWrite}; - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; + init_logger(); - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; - - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); let tip5_table = TIP5_TABLE.to_vec(); let table: LookupTable = Arc::new((0..256).zip_eq(tip5_table).collect()); let config = CircuitConfig::standard_recursion_config(); @@ -145,18 +128,9 @@ fn test_one_lookup() -> anyhow::Result<()> { // Tests one lookup in two different lookup tables. 
#[test] -pub fn test_two_luts() -> anyhow::Result<()> { - use crate::field::types::Field; - use crate::iop::witness::{PartialWitness, WitnessWrite}; - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; - - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); +fn test_two_luts() -> anyhow::Result<()> { + init_logger(); + let config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); @@ -229,17 +203,9 @@ pub fn test_two_luts() -> anyhow::Result<()> { } #[test] -pub fn test_different_inputs() -> anyhow::Result<()> { - use crate::field::types::Field; - use crate::iop::witness::{PartialWitness, WitnessWrite}; - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); +fn test_different_inputs() -> anyhow::Result<()> { + init_logger(); + let config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); @@ -314,17 +280,9 @@ pub fn test_different_inputs() -> anyhow::Result<()> { // This test looks up over 514 values for one LookupTableGate, which means that several LookupGates are created. 
#[test] -pub fn test_many_lookups() -> anyhow::Result<()> { - use crate::field::types::Field; - use crate::iop::witness::{PartialWitness, WitnessWrite}; - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); +fn test_many_lookups() -> anyhow::Result<()> { + init_logger(); + let config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); @@ -404,18 +362,9 @@ pub fn test_many_lookups() -> anyhow::Result<()> { // Tests whether, when adding the same LUT to the circuit, the circuit only adds one copy, with the same index. #[test] -pub fn test_same_luts() -> anyhow::Result<()> { - use crate::field::types::Field; - use crate::iop::witness::{PartialWitness, WitnessWrite}; - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; - - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); +fn test_same_luts() -> anyhow::Result<()> { + init_logger(); + let config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); @@ -469,21 +418,11 @@ pub fn test_same_luts() -> anyhow::Result<()> { #[test] fn test_big_lut() -> anyhow::Result<()> { - use crate::field::types::Field; - use crate::iop::witness::{PartialWitness, WitnessWrite}; - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; + init_logger(); - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; - - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); let 
config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); - const LUT_SIZE: usize = u16::MAX as usize + 1; let inputs: [u16; LUT_SIZE] = core::array::from_fn(|i| i as u16); let lut_fn = |inp: u16| inp / 10; let lut_index = builder.add_lookup_table_from_fn(lut_fn, &inputs); @@ -522,21 +461,11 @@ fn test_big_lut() -> anyhow::Result<()> { #[test] fn test_many_lookups_on_big_lut() -> anyhow::Result<()> { - use crate::field::types::Field; - use crate::iop::witness::{PartialWitness, WitnessWrite}; - use crate::plonk::circuit_builder::CircuitBuilder; - use crate::plonk::circuit_data::CircuitConfig; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; - - const D: usize = 2; - type C = PoseidonGoldilocksConfig; - type F = >::F; + init_logger(); - LOGGER_INITIALIZED.call_once(|| init_logger().unwrap()); let config = CircuitConfig::standard_recursion_config(); let mut builder = CircuitBuilder::::new(config); - const LUT_SIZE: usize = u16::MAX as usize + 1; let inputs: [u16; LUT_SIZE] = core::array::from_fn(|i| i as u16); let lut_fn = |inp: u16| inp / 10; let lut_index = builder.add_lookup_table_from_fn(lut_fn, &inputs); @@ -581,11 +510,15 @@ fn test_many_lookups_on_big_lut() -> anyhow::Result<()> { data.verify(proof) } -fn init_logger() -> anyhow::Result<()> { - let mut builder = env_logger::Builder::from_default_env(); - builder.format_timestamp(None); - builder.filter_level(LevelFilter::Debug); +fn init_logger() { + #[cfg(feature = "std")] + { + LOGGER_INITIALIZED.call_once(|| { + let mut builder = env_logger::Builder::from_default_env(); + builder.format_timestamp(None); + builder.filter_level(log::LevelFilter::Debug); - builder.try_init()?; - Ok(()) + builder.try_init().unwrap(); + }); + } } diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs index 67db68649a..4c2a536905 100644 --- a/plonky2/src/plonk/circuit_builder.rs +++ b/plonky2/src/plonk/circuit_builder.rs @@ -1,3 
+1,5 @@ +//! Logic for building plonky2 circuits. + use alloc::collections::BTreeMap; use alloc::sync::Arc; use alloc::vec; @@ -8,7 +10,7 @@ use std::time::Instant; use hashbrown::{HashMap, HashSet}; use itertools::Itertools; -use log::{debug, info, Level}; +use log::{debug, info, warn, Level}; use plonky2_util::ceil_div_usize; use crate::field::cosets::get_unique_coset_shifts; @@ -83,7 +85,60 @@ pub struct LookupWire { /// Index of the first lookup table row (i.e. the last `LookupTableGate`). pub first_lut_gate: usize, } + +/// Structure used to construct a plonky2 circuit. It provides all the necessary toolkit that, +/// from an initial circuit configuration, will enable one to design a circuit and its associated +/// prover/verifier data. +/// +/// # Usage +/// +/// ```rust +/// use plonky2::plonk::circuit_data::CircuitConfig; +/// use plonky2::iop::witness::PartialWitness; +/// use plonky2::plonk::circuit_builder::CircuitBuilder; +/// use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; +/// use plonky2::field::types::Field; +/// +/// // Define parameters for this circuit +/// const D: usize = 2; +/// type C = PoseidonGoldilocksConfig; +/// type F = >::F; +/// +/// let config = CircuitConfig::standard_recursion_config(); +/// let mut builder = CircuitBuilder::::new(config); +/// +/// // Build a circuit for the statement: "I know the 100th term +/// // of the Fibonacci sequence, starting from 0 and 1". +/// let initial_a = builder.constant(F::ZERO); +/// let initial_b = builder.constant(F::ONE); +/// let mut prev_target = initial_a; +/// let mut cur_target = initial_b; +/// for _ in 0..99 { +/// // Encode an addition of the two previous terms +/// let temp = builder.add(prev_target, cur_target); +/// // Shift the two previous terms with the new value +/// prev_target = cur_target; +/// cur_target = temp; +/// } +/// +/// // The only public input is the result (which is generated). 
+/// builder.register_public_input(cur_target); +/// +/// // Build the circuit +/// let circuit_data = builder.build::(); +/// +/// // Now compute the witness and generate a proof +/// let mut pw = PartialWitness::new(); +/// +/// // There are no public inputs to register, as the only one +/// // will be generated while proving the statement. +/// let proof = circuit_data.prove(pw).unwrap(); +/// +/// // Verify the proof +/// assert!(circuit_data.verify(proof).is_ok()); +/// ``` pub struct CircuitBuilder, const D: usize> { + /// Circuit configuration to be used by this [`CircuitBuilder`]. pub config: CircuitConfig, /// A domain separator, which is included in the initial Fiat-Shamir seed. This is generally not @@ -126,7 +181,8 @@ pub struct CircuitBuilder, const D: usize> { /// List of constant generators used to fill the constant wires. constant_generators: Vec>, - /// Rows for each LUT: LookupWire contains: first `LookupGate`, first `LookupTableGate`, last `LookupTableGate`. + /// Rows for each LUT: [`LookupWire`] contains: first [`LookupGate`], first and last + /// [LookupTableGate](crate::gates::lookup_table::LookupTableGate). lookup_rows: Vec, /// For each LUT index, vector of `(looking_in, looking_out)` pairs. @@ -146,6 +202,10 @@ pub struct CircuitBuilder, const D: usize> { } impl, const D: usize> CircuitBuilder { + /// Given a [`CircuitConfig`], generate a new [`CircuitBuilder`] instance. + /// It will also check that the configuration provided is consistent, i.e. + /// that the different parameters provided can achieve the targeted security + /// level. pub fn new(config: CircuitConfig) -> Self { let builder = CircuitBuilder { config, @@ -173,6 +233,8 @@ impl, const D: usize> CircuitBuilder { builder } + /// Assert that the configuration used to create this `CircuitBuilder` is consistent, + /// i.e. that the different parameters meet the targeted security level. 
fn check_config(&self) { let &CircuitConfig { security_bits, @@ -201,6 +263,7 @@ impl, const D: usize> CircuitBuilder { self.domain_separator = Some(separator); } + /// Outputs the number of gates in this circuit. pub fn num_gates(&self) -> usize { self.gate_instances.len() } @@ -215,6 +278,7 @@ impl, const D: usize> CircuitBuilder { targets.iter().for_each(|&t| self.register_public_input(t)); } + /// Outputs the number of public inputs in this circuit. pub fn num_public_inputs(&self) -> usize { self.public_inputs.len() } @@ -244,10 +308,13 @@ impl, const D: usize> CircuitBuilder { self.lut_to_lookups[lut_index].push((looking_in, looking_out)); } + /// Outputs the number of lookup tables in this circuit. pub fn num_luts(&self) -> usize { self.lut_to_lookups.len() } + /// Given an index, outputs the corresponding looking table in the set of tables + /// used in this circuit, as a sequence of target tuples `(input, output)`. pub fn get_lut_lookups(&self, lut_index: usize) -> &[(Target, Target)] { &self.lut_to_lookups[lut_index] } @@ -262,22 +329,28 @@ impl, const D: usize> CircuitBuilder { Target::VirtualTarget { index } } + /// Adds `n` new "virtual" targets. pub fn add_virtual_targets(&mut self, n: usize) -> Vec { (0..n).map(|_i| self.add_virtual_target()).collect() } + /// Adds `N` new "virtual" targets, arranged as an array. pub fn add_virtual_target_arr(&mut self) -> [Target; N] { [0; N].map(|_| self.add_virtual_target()) } + /// Adds a new `HashOutTarget`. `NUM_HASH_OUT_ELTS` being hardcoded to 4, it internally + /// adds 4 virtual targets in a vector fashion. pub fn add_virtual_hash(&mut self) -> HashOutTarget { HashOutTarget::from_vec(self.add_virtual_targets(4)) } + /// Adds a new `MerkleCapTarget`, consisting in `1 << cap_height` `HashOutTarget`. pub fn add_virtual_cap(&mut self, cap_height: usize) -> MerkleCapTarget { MerkleCapTarget(self.add_virtual_hashes(1 << cap_height)) } + /// Adds `n` new `HashOutTarget` in a vector fashion. 
pub fn add_virtual_hashes(&mut self, n: usize) -> Vec { (0..n).map(|_i| self.add_virtual_hash()).collect() } @@ -337,7 +410,9 @@ impl, const D: usize> CircuitBuilder { } /// Add a virtual verifier data, register it as a public input and set it to `self.verifier_data_public_input`. - /// WARNING: Do not register any public input after calling this! TODO: relax this + /// + /// **WARNING**: Do not register any public input after calling this! + // TODO: relax this pub fn add_verifier_data_public_inputs(&mut self) -> VerifierCircuitTarget { assert!( self.verifier_data_public_input.is_none(), @@ -410,16 +485,12 @@ impl, const D: usize> CircuitBuilder { ); } + /// Adds a gate type to the set of gates to be used in this circuit. This can be useful + /// in conditional recursion to uniformize the set of gates of the different circuits. pub fn add_gate_to_gate_set(&mut self, gate: GateRef) { self.gates.insert(gate); } - pub fn connect_extension(&mut self, src: ExtensionTarget, dst: ExtensionTarget) { - for i in 0..D { - self.connect(src.0[i], dst.0[i]); - } - } - /// Adds a generator which will copy `src` to `dst`. pub fn generate_copy(&mut self, src: Target, dst: Target) { self.add_simple_generator(CopyGenerator { src, dst }); @@ -427,6 +498,8 @@ impl, const D: usize> CircuitBuilder { /// Uses Plonk's permutation argument to require that two elements be equal. /// Both elements must be routable, otherwise this method will panic. + /// + /// For an example of usage, see [`CircuitBuilder::assert_one()`]. pub fn connect(&mut self, x: Target, y: Target) { assert!( x.is_routable(&self.config), @@ -440,11 +513,34 @@ impl, const D: usize> CircuitBuilder { .push(CopyConstraint::new((x, y), self.context_log.open_stack())); } + /// Enforces that two [`ExtensionTarget`] underlying values are equal. 
+ pub fn connect_extension(&mut self, src: ExtensionTarget, dst: ExtensionTarget) { + for i in 0..D { + self.connect(src.0[i], dst.0[i]); + } + } + + /// Enforces that a routable `Target` value is 0, using Plonk's permutation argument. pub fn assert_zero(&mut self, x: Target) { let zero = self.zero(); self.connect(x, zero); } + /// Enforces that a routable `Target` value is 1, using Plonk's permutation argument. + /// + /// # Example + /// + /// Let say the circuit contains a target `a`, and a target `b` as public input so that the + /// prover can non-deterministically compute the multiplicative inverse of `a` when generating + /// a proof. + /// + /// One can then add the following constraint in the circuit to enforce that the value provided + /// by the prover is correct: + /// + /// ```ignore + /// let c = builder.mul(a, b); + /// builder.assert_one(c); + /// ``` pub fn assert_one(&mut self, x: Target) { let one = self.one(); self.connect(x, one); @@ -479,10 +575,12 @@ impl, const D: usize> CircuitBuilder { self.constant(F::NEG_ONE) } + /// Returns a routable boolean target set to false. pub fn _false(&mut self) -> BoolTarget { BoolTarget::new_unsafe(self.zero()) } + /// Returns a routable boolean target set to true. pub fn _true(&mut self) -> BoolTarget { BoolTarget::new_unsafe(self.one()) } @@ -501,10 +599,12 @@ impl, const D: usize> CircuitBuilder { target } + /// Returns a vector of routable targets with the given constant values. pub fn constants(&mut self, constants: &[F]) -> Vec { constants.iter().map(|&c| self.constant(c)).collect() } + /// Returns a routable target with the given constant boolean value. pub fn constant_bool(&mut self, b: bool) -> BoolTarget { if b { self._true() @@ -513,12 +613,14 @@ impl, const D: usize> CircuitBuilder { } } + /// Returns a routable [`HashOutTarget`]. 
pub fn constant_hash(&mut self, h: HashOut) -> HashOutTarget { HashOutTarget { elements: h.elements.map(|x| self.constant(x)), } } + /// Returns a routable [`MerkleCapTarget`]. pub fn constant_merkle_cap>>( &mut self, cap: &MerkleCap, @@ -545,7 +647,7 @@ impl, const D: usize> CircuitBuilder { self.targets_to_constants.get(&target).cloned() } - /// If the given `ExtensionTarget` is a constant (i.e. it was created by the + /// If the given [`ExtensionTarget`] is a constant (i.e. it was created by the /// `constant_extension(F)` method), returns its constant value. Otherwise, returns `None`. pub fn target_as_constant_ext(&self, target: ExtensionTarget) -> Option { // Get a Vec of any coefficients that are constant. If we end up with exactly D of them, @@ -704,7 +806,7 @@ impl, const D: usize> CircuitBuilder { } /// The number of (base field) `arithmetic` operations that can be performed in a single gate. - pub(crate) fn num_base_arithmetic_ops_per_gate(&self) -> usize { + pub(crate) const fn num_base_arithmetic_ops_per_gate(&self) -> usize { if self.config.use_base_arithmetic_gate { ArithmeticGate::new_from_config(&self.config).num_ops } else { @@ -713,7 +815,7 @@ impl, const D: usize> CircuitBuilder { } /// The number of `arithmetic_extension` operations that can be performed in a single gate. - pub(crate) fn num_ext_arithmetic_ops_per_gate(&self) -> usize { + pub(crate) const fn num_ext_arithmetic_ops_per_gate(&self) -> usize { ArithmeticExtensionGate::::new_from_config(&self.config).num_ops } @@ -906,7 +1008,7 @@ impl, const D: usize> CircuitBuilder { /// In PLONK's permutation argument, there's a slight chance of division by zero. We can /// mitigate this by randomizing some unused witness elements, so if proving fails with /// division by zero, the next attempt will have an (almost) independent chance of success. - /// See https://github.com/0xPolygonZero/plonky2/issues/456 + /// See . 
fn randomize_unused_pi_wires(&mut self, pi_gate: usize) { for wire in PublicInputGate::wires_public_inputs_hash().end..self.config.num_wires { self.add_simple_generator(RandomValueGenerator { @@ -917,9 +1019,20 @@ impl, const D: usize> CircuitBuilder { /// Builds a "full circuit", with both prover and verifier data. pub fn build_with_options>( - mut self, + self, commit_to_sigma: bool, ) -> CircuitData { + let (circuit_data, success) = self.try_build_with_options(commit_to_sigma); + if !success { + panic!("Failed to build circuit"); + } + circuit_data + } + + pub fn try_build_with_options>( + mut self, + commit_to_sigma: bool, + ) -> (CircuitData, bool) { let mut timing = TimingTree::new("preprocess", Level::Trace); #[cfg(feature = "std")] @@ -1125,8 +1238,14 @@ impl, const D: usize> CircuitBuilder { num_lookup_selectors, luts: self.luts, }; + + let mut success = true; + if let Some(goal_data) = self.goal_common_data { - assert_eq!(goal_data, common, "The expected circuit data passed to cyclic recursion method did not match the actual circuit"); + if goal_data != common { + warn!("The expected circuit data passed to cyclic recursion method did not match the actual circuit"); + success = false; + } } let prover_only = ProverOnlyCircuitData:: { @@ -1151,13 +1270,17 @@ impl, const D: usize> CircuitBuilder { timing.print(); #[cfg(feature = "std")] debug!("Building circuit took {}s", start.elapsed().as_secs_f32()); - CircuitData { - prover_only, - verifier_only, - common, - } + ( + CircuitData { + prover_only, + verifier_only, + common, + }, + success, + ) } + /// Builds a "full circuit", with both prover and verifier data. pub fn build>(self) -> CircuitData { self.build_with_options(true) } diff --git a/plonky2/src/plonk/circuit_data.rs b/plonky2/src/plonk/circuit_data.rs index c93de8cb98..d9847f4be8 100644 --- a/plonky2/src/plonk/circuit_data.rs +++ b/plonky2/src/plonk/circuit_data.rs @@ -1,3 +1,17 @@ +//! Circuit data specific to the prover and the verifier. +//! 
+//! This module also defines a [`CircuitConfig`] to be customized +//! when building circuits for arbitrary statements. +//! +//! After building a circuit, one obtains an instance of [`CircuitData`]. +//! This contains both prover and verifier data, allowing to generate +//! proofs for the given circuit and verify them. +//! +//! Most of the [`CircuitData`] is actually prover-specific, and can be +//! extracted by calling [`CircuitData::prover_data`] method. +//! The verifier data can similarly be extracted by calling [`CircuitData::verifier_data`]. +//! This is useful to allow even small devices to verify plonky2 proofs. + use alloc::collections::BTreeMap; use alloc::vec; use alloc::vec::Vec; @@ -38,10 +52,22 @@ use crate::util::serialization::{ }; use crate::util::timing::TimingTree; +/// Configuration to be used when building a circuit. This defines the shape of the circuit +/// as well as its targeted security level and sub-protocol (e.g. FRI) parameters. +/// +/// It supports a [`Default`] implementation tailored for recursion with Poseidon hash (of width 12) +/// as internal hash function and FRI rate of 1/8. #[derive(Clone, Debug, Eq, PartialEq, Serialize)] pub struct CircuitConfig { + /// The number of wires available at each row. This corresponds to the "width" of the circuit, + /// and consists in the sum of routed wires and advice wires. pub num_wires: usize, + /// The number of routed wires, i.e. wires that will be involved in Plonk's permutation argument. + /// This allows copy constraints, i.e. enforcing that two distant values in a circuit are equal. + /// Non-routed wires are called advice wires. pub num_routed_wires: usize, + /// The number of constants that can be used per gate. If a gate requires more constants than the config + /// allows, the [`CircuitBuilder`] will complain when trying to add this gate to its set of gates. 
pub num_constants: usize, /// Whether to use a dedicated gate for base field arithmetic, rather than using a single gate /// for both base field and extension field arithmetic. @@ -50,6 +76,8 @@ pub struct CircuitConfig { /// The number of challenge points to generate, for IOPs that have soundness errors of (roughly) /// `degree / |F|`. pub num_challenges: usize, + /// A boolean to activate the zero-knowledge property. When this is set to `false`, proofs *may* + /// leak additional information. pub zero_knowledge: bool, /// A cap on the quotient polynomial's degree factor. The actual degree factor is derived /// systematically, but will never exceed this value. @@ -64,12 +92,12 @@ impl Default for CircuitConfig { } impl CircuitConfig { - pub fn num_advice_wires(&self) -> usize { + pub const fn num_advice_wires(&self) -> usize { self.num_wires - self.num_routed_wires } /// A typical recursion config, without zero-knowledge, targeting ~100 bit security. - pub fn standard_recursion_config() -> Self { + pub const fn standard_recursion_config() -> Self { Self { num_wires: 135, num_routed_wires: 80, @@ -265,7 +293,7 @@ impl, C: GenericConfig, const D: usize> } /// Circuit data required by the prover. -#[derive(Debug)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct VerifierCircuitData< F: RichField + Extendable, C: GenericConfig, @@ -442,11 +470,11 @@ impl, const D: usize> CommonCircuitData { self.fri_params.degree_bits } - pub fn degree(&self) -> usize { + pub const fn degree(&self) -> usize { 1 << self.degree_bits() } - pub fn lde_size(&self) -> usize { + pub const fn lde_size(&self) -> usize { self.fri_params.lde_size() } @@ -462,37 +490,37 @@ impl, const D: usize> CommonCircuitData { .expect("No gates?") } - pub fn quotient_degree(&self) -> usize { + pub const fn quotient_degree(&self) -> usize { self.quotient_degree_factor * self.degree() } /// Range of the constants polynomials in the `constants_sigmas_commitment`. 
- pub fn constants_range(&self) -> Range { + pub const fn constants_range(&self) -> Range { 0..self.num_constants } /// Range of the sigma polynomials in the `constants_sigmas_commitment`. - pub fn sigmas_range(&self) -> Range { + pub const fn sigmas_range(&self) -> Range { self.num_constants..self.num_constants + self.config.num_routed_wires } /// Range of the `z`s polynomials in the `zs_partial_products_commitment`. - pub fn zs_range(&self) -> Range { + pub const fn zs_range(&self) -> Range { 0..self.config.num_challenges } /// Range of the partial products polynomials in the `zs_partial_products_lookup_commitment`. - pub fn partial_products_range(&self) -> Range { + pub const fn partial_products_range(&self) -> Range { self.config.num_challenges..(self.num_partial_products + 1) * self.config.num_challenges } /// Range of lookup polynomials in the `zs_partial_products_lookup_commitment`. - pub fn lookup_range(&self) -> RangeFrom { + pub const fn lookup_range(&self) -> RangeFrom { self.num_zs_partial_products_polys().. } /// Range of lookup polynomials needed for evaluation at `g * zeta`. - pub fn next_lookup_range(&self, i: usize) -> Range { + pub const fn next_lookup_range(&self, i: usize) -> Range { self.num_zs_partial_products_polys() + i * self.num_lookup_polys ..self.num_zs_partial_products_polys() + i * self.num_lookup_polys + 2 } @@ -573,7 +601,7 @@ impl, const D: usize> CommonCircuitData { ) } - pub(crate) fn num_preprocessed_polys(&self) -> usize { + pub(crate) const fn num_preprocessed_polys(&self) -> usize { self.sigmas_range().end } @@ -589,12 +617,12 @@ impl, const D: usize> CommonCircuitData { ) } - pub(crate) fn num_zs_partial_products_polys(&self) -> usize { + pub(crate) const fn num_zs_partial_products_polys(&self) -> usize { self.config.num_challenges * (1 + self.num_partial_products) } /// Returns the total number of lookup polynomials. 
- pub(crate) fn num_all_lookup_polys(&self) -> usize { + pub(crate) const fn num_all_lookup_polys(&self) -> usize { self.config.num_challenges * self.num_lookup_polys } fn fri_zs_polys(&self) -> Vec { @@ -618,7 +646,7 @@ impl, const D: usize> CommonCircuitData { ..self.num_zs_partial_products_polys() + self.num_all_lookup_polys(), ) } - pub(crate) fn num_quotient_polys(&self) -> usize { + pub(crate) const fn num_quotient_polys(&self) -> usize { self.config.num_challenges * self.quotient_degree_factor } diff --git a/plonky2/src/plonk/config.rs b/plonky2/src/plonk/config.rs index 2391ef6cef..1ed40c40ce 100644 --- a/plonky2/src/plonk/config.rs +++ b/plonky2/src/plonk/config.rs @@ -1,3 +1,11 @@ +//! Hashing configuration to be used when building a circuit. +//! +//! This module defines a [`Hasher`] trait as well as its recursive +//! counterpart [`AlgebraicHasher`] for in-circuit hashing. It also +//! provides concrete configurations, one fully recursive leveraging +//! the Poseidon hash function both internally and natively, and one +//! mixing Poseidon internally and truncated Keccak externally. + use alloc::vec; use alloc::vec::Vec; use core::fmt::Debug; diff --git a/plonky2/src/plonk/copy_constraint.rs b/plonky2/src/plonk/copy_constraint.rs index 50e85fbf2a..ea92ec1c9e 100644 --- a/plonky2/src/plonk/copy_constraint.rs +++ b/plonky2/src/plonk/copy_constraint.rs @@ -18,7 +18,7 @@ impl From<(Target, Target)> for CopyConstraint { } impl CopyConstraint { - pub fn new(pair: (Target, Target), name: String) -> Self { + pub const fn new(pair: (Target, Target), name: String) -> Self { Self { pair, name } } } diff --git a/plonky2/src/plonk/mod.rs b/plonky2/src/plonk/mod.rs index 604c1f7992..565b1c57c2 100644 --- a/plonky2/src/plonk/mod.rs +++ b/plonky2/src/plonk/mod.rs @@ -1,3 +1,8 @@ +//! plonky2 proving system. +//! +//! This module also defines the [CircuitBuilder](circuit_builder::CircuitBuilder) +//! 
structure, used to build custom plonky2 circuits satisfying arbitrary statements. + pub mod circuit_builder; pub mod circuit_data; pub mod config; diff --git a/plonky2/src/plonk/plonk_common.rs b/plonky2/src/plonk/plonk_common.rs index 53c75af1d0..ca8ea9196a 100644 --- a/plonky2/src/plonk/plonk_common.rs +++ b/plonky2/src/plonk/plonk_common.rs @@ -1,3 +1,5 @@ +//! Utility methods and constants for Plonk. + use alloc::vec; use alloc::vec::Vec; @@ -38,7 +40,7 @@ impl PlonkOracle { }; } -pub fn salt_size(salted: bool) -> usize { +pub const fn salt_size(salted: bool) -> usize { if salted { SALT_SIZE } else { diff --git a/plonky2/src/plonk/proof.rs b/plonky2/src/plonk/proof.rs index bd93523397..de82746af1 100644 --- a/plonky2/src/plonk/proof.rs +++ b/plonky2/src/plonk/proof.rs @@ -1,3 +1,9 @@ +//! plonky2 proof definition. +//! +//! Proofs can be later compressed to reduce their size, into either +//! [`CompressedProof`] or [`CompressedProofWithPublicInputs`] formats. +//! The latter can be directly passed to a verifier to assert its correctness. + use alloc::vec; use alloc::vec::Vec; @@ -445,7 +451,10 @@ impl OpeningSetTarget { #[cfg(test)] mod tests { - use alloc::sync::Arc; + #[cfg(not(feature = "std"))] + use alloc::{sync::Arc, vec}; + #[cfg(feature = "std")] + use std::sync::Arc; use anyhow::Result; use itertools::Itertools; diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index 41aebdb1e9..153610dcb6 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -1,3 +1,5 @@ +//! plonky2 prover implementation. + use alloc::vec::Vec; use alloc::{format, vec}; use core::cmp::min; @@ -441,7 +443,7 @@ fn wires_permutation_partial_products_and_zs< } /// Computes lookup polynomials for a given challenge. -/// The polynomials hold the value of RE, Sum and Ldc of the Tip5 paper (https://eprint.iacr.org/2023/107.pdf). To reduce their +/// The polynomials hold the value of RE, Sum and Ldc of the Tip5 paper (). 
To reduce their /// numbers, we batch multiple slots in a single polynomial. Since RE only involves degree one constraints, we can batch /// all the slots of a row. For Sum and Ldc, batching increases the constraint degree, so we bound the number of /// partial polynomials according to `max_quotient_degree_factor`. diff --git a/plonky2/src/plonk/vanishing_poly.rs b/plonky2/src/plonk/vanishing_poly.rs index 2c53efcfd3..e3ddcf5b88 100644 --- a/plonky2/src/plonk/vanishing_poly.rs +++ b/plonky2/src/plonk/vanishing_poly.rs @@ -323,8 +323,8 @@ pub(crate) fn eval_vanishing_poly_base_batch, const res_batch } -/// Evaluates all lookup constraints, based on the logarithmic derivatives paper (https://eprint.iacr.org/2022/1530.pdf), -/// following the Tip5 paper's implementation (https://eprint.iacr.org/2023/107.pdf). +/// Evaluates all lookup constraints, based on the logarithmic derivatives paper (), +/// following the Tip5 paper's implementation (). /// /// There are three polynomials to check: /// - RE ensures the well formation of lookup tables; diff --git a/plonky2/src/plonk/vars.rs b/plonky2/src/plonk/vars.rs index 758018f5d6..b9d6d790ff 100644 --- a/plonky2/src/plonk/vars.rs +++ b/plonky2/src/plonk/vars.rs @@ -1,3 +1,5 @@ +//! Logic for evaluating constraints. 
+ use core::ops::Range; use crate::field::extension::algebra::ExtensionAlgebra; @@ -80,11 +82,11 @@ impl<'a, F: Field> EvaluationVarsBaseBatch<'a, F> { self.local_constants = &self.local_constants[num_selectors * self.len()..]; } - pub fn len(&self) -> usize { + pub const fn len(&self) -> usize { self.batch_size } - pub fn is_empty(&self) -> bool { + pub const fn is_empty(&self) -> bool { self.len() == 0 } @@ -100,7 +102,7 @@ impl<'a, F: Field> EvaluationVarsBaseBatch<'a, F> { } } - pub fn iter(&self) -> EvaluationVarsBaseBatchIter<'a, F> { + pub const fn iter(&self) -> EvaluationVarsBaseBatchIter<'a, F> { EvaluationVarsBaseBatchIter::new(*self) } @@ -136,7 +138,7 @@ pub struct EvaluationVarsBaseBatchIter<'a, F: Field> { } impl<'a, F: Field> EvaluationVarsBaseBatchIter<'a, F> { - pub fn new(vars_batch: EvaluationVarsBaseBatch<'a, F>) -> Self { + pub const fn new(vars_batch: EvaluationVarsBaseBatch<'a, F>) -> Self { EvaluationVarsBaseBatchIter { i: 0, vars_batch } } } diff --git a/plonky2/src/plonk/verifier.rs b/plonky2/src/plonk/verifier.rs index b160fddc28..fa1bc14b84 100644 --- a/plonky2/src/plonk/verifier.rs +++ b/plonky2/src/plonk/verifier.rs @@ -1,3 +1,5 @@ +//! plonky2 verifier implementation. 
+ use anyhow::{ensure, Result}; use crate::field::extension::Extendable; diff --git a/plonky2/src/recursion/conditional_recursive_verifier.rs b/plonky2/src/recursion/conditional_recursive_verifier.rs index 3f3b626751..a35b46ea03 100644 --- a/plonky2/src/recursion/conditional_recursive_verifier.rs +++ b/plonky2/src/recursion/conditional_recursive_verifier.rs @@ -336,6 +336,9 @@ impl, const D: usize> CircuitBuilder { #[cfg(test)] mod tests { + #[cfg(not(feature = "std"))] + use alloc::vec; + use anyhow::Result; use hashbrown::HashMap; diff --git a/plonky2/src/recursion/cyclic_recursion.rs b/plonky2/src/recursion/cyclic_recursion.rs index 4d5fc60250..172c0826bc 100644 --- a/plonky2/src/recursion/cyclic_recursion.rs +++ b/plonky2/src/recursion/cyclic_recursion.rs @@ -1,5 +1,7 @@ #![allow(clippy::int_plus_one)] // Makes more sense for some inequalities below. +use alloc::vec::Vec; + use anyhow::{ensure, Result}; use crate::field::extension::Extendable; @@ -196,6 +198,9 @@ where #[cfg(test)] mod tests { + #[cfg(not(feature = "std"))] + use alloc::vec; + use anyhow::Result; use crate::field::extension::Extendable; diff --git a/plonky2/src/recursion/dummy_circuit.rs b/plonky2/src/recursion/dummy_circuit.rs index 620c979f4b..ee73105acc 100644 --- a/plonky2/src/recursion/dummy_circuit.rs +++ b/plonky2/src/recursion/dummy_circuit.rs @@ -1,3 +1,4 @@ +use alloc::string::{String, ToString}; use alloc::vec; use alloc::vec::Vec; diff --git a/plonky2/src/recursion/mod.rs b/plonky2/src/recursion/mod.rs index 0e9cd2ccb3..438f600763 100644 --- a/plonky2/src/recursion/mod.rs +++ b/plonky2/src/recursion/mod.rs @@ -1,3 +1,9 @@ +//! Recursion logic for verifying recursively plonky2 circuits. +//! +//! This module also provides ways to perform conditional recursive verification +//! (between two different circuits, depending on a condition), and cyclic +//! recursion where a circuit implements its own verification logic. 
+ pub mod conditional_recursive_verifier; pub mod cyclic_recursion; pub mod dummy_circuit; diff --git a/plonky2/src/recursion/recursive_verifier.rs b/plonky2/src/recursion/recursive_verifier.rs index ada2b00242..2da2440844 100644 --- a/plonky2/src/recursion/recursive_verifier.rs +++ b/plonky2/src/recursion/recursive_verifier.rs @@ -191,7 +191,10 @@ impl, const D: usize> CircuitBuilder { #[cfg(test)] mod tests { - use alloc::sync::Arc; + #[cfg(not(feature = "std"))] + use alloc::{sync::Arc, vec}; + #[cfg(feature = "std")] + use std::sync::Arc; use anyhow::Result; use itertools::Itertools; @@ -690,12 +693,17 @@ mod tests { let proof_from_bytes = ProofWithPublicInputs::from_bytes(proof_bytes, common_data)?; assert_eq!(proof, &proof_from_bytes); + #[cfg(feature = "std")] let now = std::time::Instant::now(); + let compressed_proof = proof.clone().compress(&vd.circuit_digest, common_data)?; let decompressed_compressed_proof = compressed_proof .clone() .decompress(&vd.circuit_digest, common_data)?; + + #[cfg(feature = "std")] info!("{:.4}s to compress proof", now.elapsed().as_secs_f64()); + assert_eq!(proof, &decompressed_compressed_proof); let compressed_proof_bytes = compressed_proof.to_bytes(); diff --git a/plonky2/src/util/context_tree.rs b/plonky2/src/util/context_tree.rs index 565e2d35ee..a0a699710d 100644 --- a/plonky2/src/util/context_tree.rs +++ b/plonky2/src/util/context_tree.rs @@ -30,7 +30,7 @@ impl ContextTree { } /// Whether this context is still in scope. - fn is_open(&self) -> bool { + const fn is_open(&self) -> bool { self.exit_gate_count.is_none() } diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index 9a54cea0a7..8f9960034d 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -1,3 +1,6 @@ +//! Utility module for helper methods and plonky2 serialization logic. 
+ +#[cfg(not(feature = "std"))] use alloc::vec::Vec; use plonky2_maybe_rayon::*; @@ -27,7 +30,7 @@ pub fn transpose(matrix: &[Vec]) -> Vec> { .collect() } -pub(crate) fn reverse_bits(n: usize, num_bits: usize) -> usize { +pub(crate) const fn reverse_bits(n: usize, num_bits: usize) -> usize { // NB: The only reason we need overflowing_shr() here as opposed // to plain '>>' is to accommodate the case n == num_bits == 0, // which would become `0 >> 64`. Rust thinks that any shift of 64 @@ -39,6 +42,10 @@ pub(crate) fn reverse_bits(n: usize, num_bits: usize) -> usize { #[cfg(test)] mod tests { + + #[cfg(not(feature = "std"))] + use alloc::vec; + use super::*; #[test] diff --git a/plonky2/src/util/partial_products.rs b/plonky2/src/util/partial_products.rs index 89be0fea86..e195af1a73 100644 --- a/plonky2/src/util/partial_products.rs +++ b/plonky2/src/util/partial_products.rs @@ -108,6 +108,9 @@ pub(crate) fn check_partial_products_circuit, const #[cfg(test)] mod tests { + #[cfg(not(feature = "std"))] + use alloc::vec; + use super::*; use crate::field::goldilocks_field::GoldilocksField; diff --git a/plonky2/src/util/reducing.rs b/plonky2/src/util/reducing.rs index bde484e875..e1ba397b1c 100644 --- a/plonky2/src/util/reducing.rs +++ b/plonky2/src/util/reducing.rs @@ -28,7 +28,7 @@ pub struct ReducingFactor { } impl ReducingFactor { - pub fn new(base: F) -> Self { + pub const fn new(base: F) -> Self { Self { base, count: 0 } } @@ -117,7 +117,7 @@ pub struct ReducingFactorTarget { } impl ReducingFactorTarget { - pub fn new(base: ExtensionTarget) -> Self { + pub const fn new(base: ExtensionTarget) -> Self { Self { base, count: 0 } } diff --git a/plonky2/src/util/serialization/gate_serialization.rs b/plonky2/src/util/serialization/gate_serialization.rs index 008e29c0bf..c5763fb0bf 100644 --- a/plonky2/src/util/serialization/gate_serialization.rs +++ b/plonky2/src/util/serialization/gate_serialization.rs @@ -1,3 +1,7 @@ +//! 
A module to help with GateRef serialization + +use alloc::vec::Vec; + use plonky2_field::extension::Extendable; use crate::gates::gate::GateRef; @@ -44,14 +48,18 @@ macro_rules! get_gate_tag_impl { Ok(tag) } else)* { - log::log!(log::Level::Error, "attempted to serialize gate with id `{}` which is unsupported by this gate serializer", $gate.0.id()); + log::log!( + log::Level::Error, + "attempted to serialize gate with id `{}` which is unsupported by this gate serializer", + $gate.0.id() + ); Err($crate::util::serialization::IoError) } }}; } #[macro_export] -/// Macro implementing the `GateSerializer` trait. +/// Macro implementing the [`GateSerializer`] trait. /// To serialize a list of gates used for a circuit, /// this macro should be called with a struct on which to implement /// this as first argument, followed by all the targeted gates. @@ -68,7 +76,7 @@ macro_rules! impl_gate_serializer { fn write_gate( &self, - buf: &mut Vec, + buf: &mut $crate::alloc::vec::Vec, gate: &$crate::gates::gate::GateRef, common: &$crate::plonk::circuit_data::CommonCircuitData, ) -> $crate::util::serialization::IoResult<()> { diff --git a/plonky2/src/util/serialization/generator_serialization.rs b/plonky2/src/util/serialization/generator_serialization.rs index 6e00340090..bad24cebf2 100644 --- a/plonky2/src/util/serialization/generator_serialization.rs +++ b/plonky2/src/util/serialization/generator_serialization.rs @@ -1,5 +1,7 @@ //! A module to help with WitnessGeneratorRef serialization +use alloc::vec::Vec; + use plonky2_field::extension::Extendable; use crate::hash::hash_types::RichField; @@ -50,14 +52,18 @@ macro_rules! 
get_generator_tag_impl { Ok(tag) } else)* { - log::log!(log::Level::Error, "attempted to serialize generator with id {} which is unsupported by this generator serializer", $generator.0.id()); + log::log!( + log::Level::Error, + "attempted to serialize generator with id {} which is unsupported by this generator serializer", + $generator.0.id() + ); Err($crate::util::serialization::IoError) } }}; } #[macro_export] -/// Macro implementing the `WitnessGeneratorSerializer` trait. +/// Macro implementing the [`WitnessGeneratorSerializer`] trait. /// To serialize a list of generators used for a circuit, /// this macro should be called with a struct on which to implement /// this as first argument, followed by all the targeted generators. @@ -74,7 +80,7 @@ macro_rules! impl_generator_serializer { fn write_generator( &self, - buf: &mut Vec, + buf: &mut $crate::alloc::vec::Vec, generator: &$crate::iop::generator::WitnessGeneratorRef, common: &$crate::plonk::circuit_data::CommonCircuitData, ) -> $crate::util::serialization::IoResult<()> { diff --git a/plonky2/src/util/serialization/mod.rs b/plonky2/src/util/serialization/mod.rs index ca95b4a137..94551bdfc6 100644 --- a/plonky2/src/util/serialization/mod.rs +++ b/plonky2/src/util/serialization/mod.rs @@ -134,7 +134,7 @@ pub trait Read { /// Reads a `usize` value from `self`. #[inline] fn read_usize(&mut self) -> IoResult { - let mut buf = [0; std::mem::size_of::()]; + let mut buf = [0; core::mem::size_of::()]; self.read_exact(&mut buf)?; Ok(u64::from_le_bytes(buf) as usize) } @@ -2173,19 +2173,19 @@ pub struct Buffer<'a> { impl<'a> Buffer<'a> { /// Builds a new [`Buffer`] over `buffer`. #[inline] - pub fn new(bytes: &'a [u8]) -> Self { + pub const fn new(bytes: &'a [u8]) -> Self { Self { bytes, pos: 0 } } /// Returns the inner position. #[inline] - pub fn pos(&self) -> usize { + pub const fn pos(&self) -> usize { self.pos } /// Returns the inner buffer. 
#[inline] - pub fn bytes(&self) -> &'a [u8] { + pub const fn bytes(&self) -> &'a [u8] { self.bytes } diff --git a/plonky2/src/util/strided_view.rs b/plonky2/src/util/strided_view.rs index c165da2ca1..bab978a784 100644 --- a/plonky2/src/util/strided_view.rs +++ b/plonky2/src/util/strided_view.rs @@ -84,7 +84,7 @@ impl<'a, P: PackedField> PackedStridedView<'a, P> { } #[inline] - pub fn get(&self, index: usize) -> Option<&'a P> { + pub const fn get(&self, index: usize) -> Option<&'a P> { if index < self.length { // Cast scalar pointer to vector pointer. let res_ptr = unsafe { self.start_ptr.add(index * self.stride) }.cast(); @@ -109,7 +109,7 @@ impl<'a, P: PackedField> PackedStridedView<'a, P> { } #[inline] - pub fn iter(&self) -> PackedStridedViewIter<'a, P> { + pub const fn iter(&self) -> PackedStridedViewIter<'a, P> { PackedStridedViewIter::new( self.start_ptr, // See comment at the top of the `impl`. Below will point more than one byte past the @@ -120,12 +120,12 @@ impl<'a, P: PackedField> PackedStridedView<'a, P> { } #[inline] - pub fn len(&self) -> usize { + pub const fn len(&self) -> usize { self.length } #[inline] - pub fn is_empty(&self) -> bool { + pub const fn is_empty(&self) -> bool { self.len() == 0 } } @@ -183,7 +183,7 @@ pub struct PackedStridedViewIter<'a, P: PackedField> { } impl<'a, P: PackedField> PackedStridedViewIter<'a, P> { - pub(self) fn new(start: *const P::Scalar, end: *const P::Scalar, stride: usize) -> Self { + pub(self) const fn new(start: *const P::Scalar, end: *const P::Scalar, stride: usize) -> Self { Self { start, end, diff --git a/plonky2/src/util/timing.rs b/plonky2/src/util/timing.rs index 4203303858..0ab721be52 100644 --- a/plonky2/src/util/timing.rs +++ b/plonky2/src/util/timing.rs @@ -1,7 +1,6 @@ -#[cfg(feature = "timing")] -use std::time::{Duration, Instant}; - use log::{log, Level}; +#[cfg(feature = "timing")] +use web_time::{Duration, Instant}; /// The hierarchy of scopes, and the time consumed by each one. 
Useful for profiling. #[cfg(feature = "timing")] @@ -54,7 +53,7 @@ impl TimingTree { /// Whether this scope is still in scope. #[cfg(feature = "timing")] - fn is_open(&self) -> bool { + const fn is_open(&self) -> bool { self.exit_time.is_none() } diff --git a/rust-toolchain b/rust-toolchain index 07ade694b1..471d867dd0 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly \ No newline at end of file +nightly-2024-02-01 \ No newline at end of file diff --git a/starky/.cargo/katex-header.html b/starky/.cargo/katex-header.html new file mode 100644 index 0000000000..20723b5d27 --- /dev/null +++ b/starky/.cargo/katex-header.html @@ -0,0 +1 @@ +../../.cargo/katex-header.html \ No newline at end of file diff --git a/starky/Cargo.toml b/starky/Cargo.toml index 62a67ee78e..0efae5fcf9 100644 --- a/starky/Cargo.toml +++ b/starky/Cargo.toml @@ -20,8 +20,14 @@ timing = ["plonky2/timing"] anyhow = { version = "1.0.40", default-features = false } itertools = { version = "0.11.0", default-features = false } log = { version = "0.4.14", default-features = false } +num-bigint = { version = "0.4.3", default-features = false } plonky2_maybe_rayon = { path = "../maybe_rayon", default-features = false } plonky2 = { path = "../plonky2", default-features = false } +plonky2_util = { path = "../util", default-features = false } [dev-dependencies] env_logger = { version = "0.9.0", default-features = false } + +# Display math equations properly in documentation +[package.metadata.docs.rs] +rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] diff --git a/starky/src/config.rs b/starky/src/config.rs index a593c827c2..24ddb6a78f 100644 --- a/starky/src/config.rs +++ b/starky/src/config.rs @@ -14,7 +14,7 @@ pub struct StarkConfig { impl StarkConfig { /// A typical configuration with a rate of 2, resulting in fast but large proofs. /// Targets ~100 bit conjectured security. 
- pub fn standard_fast_config() -> Self { + pub const fn standard_fast_config() -> Self { Self { security_bits: 100, num_challenges: 2, diff --git a/starky/src/fibonacci_stark.rs b/starky/src/fibonacci_stark.rs index 28cd59f884..903c0abff4 100644 --- a/starky/src/fibonacci_stark.rs +++ b/starky/src/fibonacci_stark.rs @@ -11,7 +11,7 @@ use plonky2::plonk::circuit_builder::CircuitBuilder; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::evaluation_frame::{StarkEvaluationFrame, StarkFrame}; -use crate::permutation::PermutationPair; +use crate::lookup::{Column, Lookup}; use crate::stark::Stark; use crate::util::trace_rows_to_poly_values; @@ -34,22 +34,23 @@ impl, const D: usize> FibonacciStark { // `num_rows`-th Fibonacci number. const PI_INDEX_RES: usize = 2; - fn new(num_rows: usize) -> Self { + const fn new(num_rows: usize) -> Self { Self { num_rows, _phantom: PhantomData, } } - /// Generate the trace using `x0, x1, 0, 1` as initial state values. + /// Generate the trace using `x0, x1, 0, 1, 1` as initial state values. fn generate_trace(&self, x0: F, x1: F) -> Vec> { let mut trace_rows = (0..self.num_rows) - .scan([x0, x1, F::ZERO, F::ONE], |acc, _| { + .scan([x0, x1, F::ZERO, F::ONE, F::ONE], |acc, _| { let tmp = *acc; acc[0] = tmp[1]; acc[1] = tmp[0] + tmp[1]; acc[2] = tmp[2] + F::ONE; acc[3] = tmp[3] + F::ONE; + // acc[4] (i.e. frequency column) remains unchanged, as we're permuting a strictly monotonous sequence. 
Some(tmp) }) .collect::>(); @@ -58,7 +59,7 @@ impl, const D: usize> FibonacciStark { } } -const COLUMNS: usize = 4; +const COLUMNS: usize = 5; const PUBLIC_INPUTS: usize = 3; impl, const D: usize> Stark for FibonacciStark { @@ -127,8 +128,13 @@ impl, const D: usize> Stark for FibonacciStar 2 } - fn permutation_pairs(&self) -> Vec { - vec![PermutationPair::singletons(2, 3)] + fn lookups(&self) -> Vec> { + vec![Lookup { + columns: vec![Column::single(2)], + table_column: Column::single(3), + frequencies_column: Column::single(4), + filter_columns: vec![None; 1], + }] } } diff --git a/starky/src/get_challenges.rs b/starky/src/get_challenges.rs index b34b427d08..5f9beddc3e 100644 --- a/starky/src/get_challenges.rs +++ b/starky/src/get_challenges.rs @@ -12,16 +12,13 @@ use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::config::{AlgebraicHasher, GenericConfig}; use crate::config::StarkConfig; -use crate::permutation::{ - get_n_permutation_challenge_sets, get_n_permutation_challenge_sets_target, -}; +use crate::lookup::{get_grand_product_challenge_set, get_grand_product_challenge_set_target}; use crate::proof::*; use crate::stark::Stark; -fn get_challenges( - stark: &S, +fn get_challenges( trace_cap: &MerkleCap, - permutation_zs_cap: Option<&MerkleCap>, + auxiliary_polys_cap: Option<&MerkleCap>, quotient_polys_cap: &MerkleCap, openings: &StarkOpeningSet, commit_phase_merkle_caps: &[MerkleCap], @@ -33,7 +30,6 @@ fn get_challenges( where F: RichField + Extendable, C: GenericConfig, - S: Stark, { let num_challenges = config.num_challenges; @@ -41,13 +37,9 @@ where challenger.observe_cap(trace_cap); - let permutation_challenge_sets = permutation_zs_cap.map(|permutation_zs_cap| { - let tmp = get_n_permutation_challenge_sets( - &mut challenger, - num_challenges, - stark.permutation_batch_size(), - ); - challenger.observe_cap(permutation_zs_cap); + let lookup_challenge_set = auxiliary_polys_cap.map(|auxiliary_polys_cap| { + let tmp = 
get_grand_product_challenge_set(&mut challenger, num_challenges); + challenger.observe_cap(auxiliary_polys_cap); tmp }); @@ -59,7 +51,7 @@ where challenger.observe_openings(&openings.to_fri_openings()); StarkProofChallenges { - permutation_challenge_sets, + lookup_challenge_set, stark_alphas, stark_zeta, fri_challenges: challenger.fri_challenges::( @@ -79,27 +71,21 @@ where { // TODO: Should be used later in compression? #![allow(dead_code)] - pub(crate) fn fri_query_indices>( - &self, - stark: &S, - config: &StarkConfig, - degree_bits: usize, - ) -> Vec { - self.get_challenges(stark, config, degree_bits) + pub(crate) fn fri_query_indices(&self, config: &StarkConfig, degree_bits: usize) -> Vec { + self.get_challenges(config, degree_bits) .fri_challenges .fri_query_indices } /// Computes all Fiat-Shamir challenges used in the STARK proof. - pub(crate) fn get_challenges>( + pub(crate) fn get_challenges( &self, - stark: &S, config: &StarkConfig, degree_bits: usize, ) -> StarkProofChallenges { let StarkProof { trace_cap, - permutation_zs_cap, + auxiliary_polys_cap, quotient_polys_cap, openings, opening_proof: @@ -111,10 +97,9 @@ where }, } = &self.proof; - get_challenges::( - stark, + get_challenges::( trace_cap, - permutation_zs_cap.as_ref(), + auxiliary_polys_cap.as_ref(), quotient_polys_cap, openings, commit_phase_merkle_caps, @@ -130,13 +115,11 @@ where pub(crate) fn get_challenges_target< F: RichField + Extendable, C: GenericConfig, - S: Stark, const D: usize, >( builder: &mut CircuitBuilder, - stark: &S, trace_cap: &MerkleCapTarget, - permutation_zs_cap: Option<&MerkleCapTarget>, + auxiliary_polys_cap: Option<&MerkleCapTarget>, quotient_polys_cap: &MerkleCapTarget, openings: &StarkOpeningSetTarget, commit_phase_merkle_caps: &[MerkleCapTarget], @@ -153,13 +136,8 @@ where challenger.observe_cap(trace_cap); - let permutation_challenge_sets = permutation_zs_cap.map(|permutation_zs_cap| { - let tmp = get_n_permutation_challenge_sets_target( - builder, - &mut 
challenger, - num_challenges, - stark.permutation_batch_size(), - ); + let lookup_challenge_set = auxiliary_polys_cap.map(|permutation_zs_cap| { + let tmp = get_grand_product_challenge_set_target(builder, &mut challenger, num_challenges); challenger.observe_cap(permutation_zs_cap); tmp }); @@ -172,7 +150,7 @@ where challenger.observe_openings(&openings.to_fri_openings()); StarkProofChallengesTarget { - permutation_challenge_sets, + lookup_challenge_set, stark_alphas, stark_zeta, fri_challenges: challenger.fri_challenges( @@ -186,22 +164,19 @@ where } impl StarkProofWithPublicInputsTarget { - pub(crate) fn get_challenges< - F: RichField + Extendable, - C: GenericConfig, - S: Stark, - >( + pub(crate) fn get_challenges( &self, builder: &mut CircuitBuilder, - stark: &S, config: &StarkConfig, ) -> StarkProofChallengesTarget where + F: RichField + Extendable, + C: GenericConfig, C::Hasher: AlgebraicHasher, { let StarkProofTarget { trace_cap, - permutation_zs_cap, + auxiliary_polys_cap, quotient_polys_cap, openings, opening_proof: @@ -213,11 +188,10 @@ impl StarkProofWithPublicInputsTarget { }, } = &self.proof; - get_challenges_target::( + get_challenges_target::( builder, - stark, trace_cap, - permutation_zs_cap.as_ref(), + auxiliary_polys_cap.as_ref(), quotient_polys_cap, openings, commit_phase_merkle_caps, diff --git a/starky/src/lib.rs b/starky/src/lib.rs index 635e57bd0b..f6b4f5e0c7 100644 --- a/starky/src/lib.rs +++ b/starky/src/lib.rs @@ -1,5 +1,6 @@ #![allow(clippy::too_many_arguments)] #![allow(clippy::type_complexity)] +#![allow(unused)] // TODO: Remove post code migration #![cfg_attr(not(feature = "std"), no_std)] extern crate alloc; @@ -9,7 +10,7 @@ mod get_challenges; pub mod config; pub mod constraint_consumer; pub mod evaluation_frame; -pub mod permutation; +pub mod lookup; pub mod proof; pub mod prover; pub mod recursive_verifier; diff --git a/starky/src/lookup.rs b/starky/src/lookup.rs new file mode 100644 index 0000000000..19f2042481 --- /dev/null +++ 
b/starky/src/lookup.rs @@ -0,0 +1,1002 @@ +use alloc::vec; +use alloc::vec::Vec; +use core::borrow::Borrow; +use core::fmt::Debug; +use core::iter::repeat; + +use itertools::Itertools; +use num_bigint::BigUint; +use plonky2::field::batch_util::batch_add_inplace; +use plonky2::field::extension::{Extendable, FieldExtension}; +use plonky2::field::packed::PackedField; +use plonky2::field::polynomial::PolynomialValues; +use plonky2::field::types::Field; +use plonky2::hash::hash_types::RichField; +use plonky2::iop::challenger::{Challenger, RecursiveChallenger}; +use plonky2::iop::ext_target::ExtensionTarget; +use plonky2::iop::target::Target; +use plonky2::plonk::circuit_builder::CircuitBuilder; +use plonky2::plonk::config::{AlgebraicHasher, Hasher}; +use plonky2::plonk::plonk_common::{ + reduce_with_powers, reduce_with_powers_circuit, reduce_with_powers_ext_circuit, +}; +use plonky2::util::serialization::{Buffer, IoResult, Read, Write}; +use plonky2_util::ceil_div_usize; + +use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; +use crate::evaluation_frame::StarkEvaluationFrame; +use crate::stark::Stark; + +/// Represents a filter, which evaluates to 1 if the row must be considered and 0 if it should be ignored. +/// It's an arbitrary degree 2 combination of columns: `products` are the degree 2 terms, and `constants` are +/// the degree 1 terms. +#[derive(Clone, Debug)] +pub struct Filter { + products: Vec<(Column, Column)>, + constants: Vec>, +} + +impl Filter { + pub fn new(products: Vec<(Column, Column)>, constants: Vec>) -> Self { + Self { + products, + constants, + } + } + + /// Returns a filter made of a single column. + pub fn new_simple(col: Column) -> Self { + Self { + products: vec![], + constants: vec![col], + } + } + + /// Given the column values for the current and next rows, evaluates the filter. 
+ pub(crate) fn eval_filter(&self, v: &[P], next_v: &[P]) -> P + where + FE: FieldExtension, + P: PackedField, + { + self.products + .iter() + .map(|(col1, col2)| col1.eval_with_next(v, next_v) * col2.eval_with_next(v, next_v)) + .sum::
<P>
() + + self + .constants + .iter() + .map(|col| col.eval_with_next(v, next_v)) + .sum::
<P>
() + } + + /// Circuit version of `eval_filter`: + /// Given the column values for the current and next rows, evaluates the filter. + pub(crate) fn eval_filter_circuit( + &self, + builder: &mut CircuitBuilder, + v: &[ExtensionTarget], + next_v: &[ExtensionTarget], + ) -> ExtensionTarget + where + F: RichField + Extendable, + { + let prods = self + .products + .iter() + .map(|(col1, col2)| { + let col1_eval = col1.eval_with_next_circuit(builder, v, next_v); + let col2_eval = col2.eval_with_next_circuit(builder, v, next_v); + builder.mul_extension(col1_eval, col2_eval) + }) + .collect::>(); + + let consts = self + .constants + .iter() + .map(|col| col.eval_with_next_circuit(builder, v, next_v)) + .collect::>(); + + let prods = builder.add_many_extension(prods); + let consts = builder.add_many_extension(consts); + builder.add_extension(prods, consts) + } + + /// Evaluate on a row of a table given in column-major form. + pub(crate) fn eval_table(&self, table: &[PolynomialValues], row: usize) -> F { + self.products + .iter() + .map(|(col1, col2)| col1.eval_table(table, row) * col2.eval_table(table, row)) + .sum::() + + self + .constants + .iter() + .map(|col| col.eval_table(table, row)) + .sum() + } + + pub(crate) fn eval_all_rows(&self, table: &[PolynomialValues]) -> Vec { + let length = table[0].len(); + + (0..length) + .map(|row| self.eval_table(table, row)) + .collect::>() + } +} + +/// Represent two linear combination of columns, corresponding to the current and next row values. +/// Each linear combination is represented as: +/// - a vector of `(usize, F)` corresponding to the column number and the associated multiplicand +/// - the constant of the linear combination. +#[derive(Clone, Debug)] +pub struct Column { + linear_combination: Vec<(usize, F)>, + next_row_linear_combination: Vec<(usize, F)>, + constant: F, +} + +impl Column { + /// Returns the representation of a single column in the current row. 
+ pub fn single(c: usize) -> Self { + Self { + linear_combination: vec![(c, F::ONE)], + next_row_linear_combination: vec![], + constant: F::ZERO, + } + } + + /// Returns multiple single columns in the current row. + pub fn singles>>( + cs: I, + ) -> impl Iterator { + cs.into_iter().map(|c| Self::single(*c.borrow())) + } + + /// Returns the representation of a single column in the next row. + pub fn single_next_row(c: usize) -> Self { + Self { + linear_combination: vec![], + next_row_linear_combination: vec![(c, F::ONE)], + constant: F::ZERO, + } + } + + /// Returns multiple single columns for the next row. + pub fn singles_next_row>>( + cs: I, + ) -> impl Iterator { + cs.into_iter().map(|c| Self::single_next_row(*c.borrow())) + } + + /// Returns a linear combination corresponding to a constant. + pub fn constant(constant: F) -> Self { + Self { + linear_combination: vec![], + next_row_linear_combination: vec![], + constant, + } + } + + /// Returns a linear combination corresponding to 0. + pub fn zero() -> Self { + Self::constant(F::ZERO) + } + + /// Returns a linear combination corresponding to 1. + pub fn one() -> Self { + Self::constant(F::ONE) + } + + /// Given an iterator of `(usize, F)` and a constant, returns the association linear combination of columns for the current row. + pub fn linear_combination_with_constant>( + iter: I, + constant: F, + ) -> Self { + let v = iter.into_iter().collect::>(); + assert!(!v.is_empty()); + + // Because this is a debug assertion, we only check it when the `std` + // feature is activated, as `Itertools::unique` relies on collections. + #[cfg(feature = "std")] + debug_assert_eq!( + v.iter().map(|(c, _)| c).unique().count(), + v.len(), + "Duplicate columns." + ); + + Self { + linear_combination: v, + next_row_linear_combination: vec![], + constant, + } + } + + /// Given an iterator of `(usize, F)` and a constant, returns the associated linear combination of columns for the current and the next rows. 
+ pub fn linear_combination_and_next_row_with_constant>( + iter: I, + next_row_iter: I, + constant: F, + ) -> Self { + let v = iter.into_iter().collect::>(); + let next_row_v = next_row_iter.into_iter().collect::>(); + + assert!(!v.is_empty() || !next_row_v.is_empty()); + + // Because these are debug assertions, we only check them when the `std` + // feature is activated, as `Itertools::unique` relies on collections. + #[cfg(feature = "std")] + { + debug_assert_eq!( + v.iter().map(|(c, _)| c).unique().count(), + v.len(), + "Duplicate columns." + ); + debug_assert_eq!( + next_row_v.iter().map(|(c, _)| c).unique().count(), + next_row_v.len(), + "Duplicate columns." + ); + } + + Self { + linear_combination: v, + next_row_linear_combination: next_row_v, + constant, + } + } + + /// Returns a linear combination of columns, with no additional constant. + pub fn linear_combination>(iter: I) -> Self { + Self::linear_combination_with_constant(iter, F::ZERO) + } + + /// Given an iterator of columns (c_0, ..., c_n) containing bits in little endian order: + /// returns the representation of c_0 + 2 * c_1 + ... + 2^n * c_n. + pub fn le_bits>>(cs: I) -> Self { + Self::linear_combination(cs.into_iter().map(|c| *c.borrow()).zip(F::TWO.powers())) + } + + /// Given an iterator of columns (c_0, ..., c_n) containing bits in little endian order: + /// returns the representation of c_0 + 2 * c_1 + ... + 2^n * c_n + k where `k` is an + /// additional constant. + pub fn le_bits_with_constant>>( + cs: I, + constant: F, + ) -> Self { + Self::linear_combination_with_constant( + cs.into_iter().map(|c| *c.borrow()).zip(F::TWO.powers()), + constant, + ) + } + + /// Given an iterator of columns (c_0, ..., c_n) containing bytes in little endian order: + /// returns the representation of c_0 + 256 * c_1 + ... + 256^n * c_n. 
+ pub fn le_bytes>>(cs: I) -> Self { + Self::linear_combination( + cs.into_iter() + .map(|c| *c.borrow()) + .zip(F::from_canonical_u16(256).powers()), + ) + } + + /// Given an iterator of columns, returns the representation of their sum. + pub fn sum>>(cs: I) -> Self { + Self::linear_combination(cs.into_iter().map(|c| *c.borrow()).zip(repeat(F::ONE))) + } + + /// Given the column values for the current row, returns the evaluation of the linear combination. + pub(crate) fn eval(&self, v: &[P]) -> P + where + FE: FieldExtension, + P: PackedField, + { + self.linear_combination + .iter() + .map(|&(c, f)| v[c] * FE::from_basefield(f)) + .sum::
<P>
() + + FE::from_basefield(self.constant) + } + + /// Given the column values for the current and next rows, evaluates the current and next linear combinations and returns their sum. + pub(crate) fn eval_with_next(&self, v: &[P], next_v: &[P]) -> P + where + FE: FieldExtension, + P: PackedField, + { + self.linear_combination + .iter() + .map(|&(c, f)| v[c] * FE::from_basefield(f)) + .sum::
<P>
() + + self + .next_row_linear_combination + .iter() + .map(|&(c, f)| next_v[c] * FE::from_basefield(f)) + .sum::
<P>
() + + FE::from_basefield(self.constant) + } + + /// Evaluate on a row of a table given in column-major form. + pub(crate) fn eval_table(&self, table: &[PolynomialValues], row: usize) -> F { + let mut res = self + .linear_combination + .iter() + .map(|&(c, f)| table[c].values[row] * f) + .sum::() + + self.constant; + + // If we access the next row at the last row, for sanity, we consider the next row's values to be 0. + // If the lookups are correctly written, the filter should be 0 in that case anyway. + if !self.next_row_linear_combination.is_empty() && row < table[0].values.len() - 1 { + res += self + .next_row_linear_combination + .iter() + .map(|&(c, f)| table[c].values[row + 1] * f) + .sum::(); + } + + res + } + + /// Evaluates the column on all rows. + pub(crate) fn eval_all_rows(&self, table: &[PolynomialValues]) -> Vec { + let length = table[0].len(); + (0..length) + .map(|row| self.eval_table(table, row)) + .collect::>() + } + + /// Circuit version of `eval`: Given a row's targets, returns their linear combination. + pub(crate) fn eval_circuit( + &self, + builder: &mut CircuitBuilder, + v: &[ExtensionTarget], + ) -> ExtensionTarget + where + F: RichField + Extendable, + { + let pairs = self + .linear_combination + .iter() + .map(|&(c, f)| { + ( + v[c], + builder.constant_extension(F::Extension::from_basefield(f)), + ) + }) + .collect::>(); + let constant = builder.constant_extension(F::Extension::from_basefield(self.constant)); + builder.inner_product_extension(F::ONE, constant, pairs) + } + + /// Circuit version of `eval_with_next`: + /// Given the targets of the current and next row, returns the sum of their linear combinations. 
+ pub(crate) fn eval_with_next_circuit( + &self, + builder: &mut CircuitBuilder, + v: &[ExtensionTarget], + next_v: &[ExtensionTarget], + ) -> ExtensionTarget + where + F: RichField + Extendable, + { + let mut pairs = self + .linear_combination + .iter() + .map(|&(c, f)| { + ( + v[c], + builder.constant_extension(F::Extension::from_basefield(f)), + ) + }) + .collect::>(); + let next_row_pairs = self.next_row_linear_combination.iter().map(|&(c, f)| { + ( + next_v[c], + builder.constant_extension(F::Extension::from_basefield(f)), + ) + }); + pairs.extend(next_row_pairs); + let constant = builder.constant_extension(F::Extension::from_basefield(self.constant)); + builder.inner_product_extension(F::ONE, constant, pairs) + } +} + +pub(crate) type ColumnFilter<'a, F> = (&'a [Column], &'a Option>); + +pub struct Lookup { + /// Columns whose values should be contained in the lookup table. + /// These are the f_i(x) polynomials in the logUp paper. + pub columns: Vec>, + /// Column containing the lookup table. + /// This is the t(x) polynomial in the paper. + pub table_column: Column, + /// Column containing the frequencies of `columns` in `table_column`. + /// This is the m(x) polynomial in the paper. + pub frequencies_column: Column, + + /// Columns to filter some elements. There is at most one filter + /// column per column to lookup. + pub filter_columns: Vec>>, +} + +impl Lookup { + pub fn num_helper_columns(&self, constraint_degree: usize) -> usize { + // One helper column for each column batch of size `constraint_degree-1`, + // then one column for the inverse of `table + challenge` and one for the `Z` polynomial. + ceil_div_usize(self.columns.len(), constraint_degree - 1) + 1 + } +} + +/// Randomness for a single instance of a permutation check protocol. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub(crate) struct GrandProductChallenge { + /// Randomness used to combine multiple columns into one. 
+ pub(crate) beta: T, + /// Random offset that's added to the beta-reduced column values. + pub(crate) gamma: T, +} + +impl GrandProductChallenge { + pub(crate) fn combine<'a, FE, P, T: IntoIterator, const D2: usize>( + &self, + terms: T, + ) -> P + where + FE: FieldExtension, + P: PackedField, + T::IntoIter: DoubleEndedIterator, + { + reduce_with_powers(terms, FE::from_basefield(self.beta)) + FE::from_basefield(self.gamma) + } +} + +impl GrandProductChallenge { + pub(crate) fn combine_circuit, const D: usize>( + &self, + builder: &mut CircuitBuilder, + terms: &[ExtensionTarget], + ) -> ExtensionTarget { + let reduced = reduce_with_powers_ext_circuit(builder, terms, self.beta); + let gamma = builder.convert_to_ext(self.gamma); + builder.add_extension(reduced, gamma) + } +} + +impl GrandProductChallenge { + pub(crate) fn combine_base_circuit, const D: usize>( + &self, + builder: &mut CircuitBuilder, + terms: &[Target], + ) -> Target { + let reduced = reduce_with_powers_circuit(builder, terms, self.beta); + builder.add(reduced, self.gamma) + } +} + +/// Like `GrandProductChallenge`, but with `num_challenges` copies to boost soundness. 
+#[derive(Clone, Eq, PartialEq, Debug)] +pub struct GrandProductChallengeSet { + pub(crate) challenges: Vec>, +} + +impl GrandProductChallengeSet { + pub(crate) fn to_buffer(&self, buffer: &mut Vec) -> IoResult<()> { + buffer.write_usize(self.challenges.len())?; + for challenge in &self.challenges { + buffer.write_target(challenge.beta)?; + buffer.write_target(challenge.gamma)?; + } + Ok(()) + } + + pub(crate) fn from_buffer(buffer: &mut Buffer) -> IoResult { + let length = buffer.read_usize()?; + let mut challenges = Vec::with_capacity(length); + for _ in 0..length { + challenges.push(GrandProductChallenge { + beta: buffer.read_target()?, + gamma: buffer.read_target()?, + }); + } + + Ok(GrandProductChallengeSet { challenges }) + } +} + +fn get_grand_product_challenge>( + challenger: &mut Challenger, +) -> GrandProductChallenge { + let beta = challenger.get_challenge(); + let gamma = challenger.get_challenge(); + GrandProductChallenge { beta, gamma } +} + +pub(crate) fn get_grand_product_challenge_set>( + challenger: &mut Challenger, + num_challenges: usize, +) -> GrandProductChallengeSet { + let challenges = (0..num_challenges) + .map(|_| get_grand_product_challenge(challenger)) + .collect(); + GrandProductChallengeSet { challenges } +} + +fn get_grand_product_challenge_target< + F: RichField + Extendable, + H: AlgebraicHasher, + const D: usize, +>( + builder: &mut CircuitBuilder, + challenger: &mut RecursiveChallenger, +) -> GrandProductChallenge { + let beta = challenger.get_challenge(builder); + let gamma = challenger.get_challenge(builder); + GrandProductChallenge { beta, gamma } +} + +pub(crate) fn get_grand_product_challenge_set_target< + F: RichField + Extendable, + H: AlgebraicHasher, + const D: usize, +>( + builder: &mut CircuitBuilder, + challenger: &mut RecursiveChallenger, + num_challenges: usize, +) -> GrandProductChallengeSet { + let challenges = (0..num_challenges) + .map(|_| get_grand_product_challenge_target(builder, challenger)) + .collect(); + 
GrandProductChallengeSet { challenges } +} + +/// logUp protocol from +/// Compute the helper columns for the lookup argument. +/// Given columns `f0,...,fk` and a column `t`, such that `∪fi ⊆ t`, and challenges `x`, +/// this computes the helper columns `h_i = 1/(x+f_2i) + 1/(x+f_2i+1)`, `g = 1/(x+t)`, +/// and `Z(gx) = Z(x) + sum h_i(x) - m(x)g(x)` where `m` is the frequencies column. +pub(crate) fn lookup_helper_columns( + lookup: &Lookup, + trace_poly_values: &[PolynomialValues], + challenge: F, + constraint_degree: usize, +) -> Vec> { + assert!( + constraint_degree == 2 || constraint_degree == 3, + "TODO: Allow other constraint degrees." + ); + + assert_eq!(lookup.columns.len(), lookup.filter_columns.len()); + + let num_total_logup_entries = trace_poly_values[0].values.len() * lookup.columns.len(); + assert!(BigUint::from(num_total_logup_entries) < F::characteristic()); + + let num_helper_columns = lookup.num_helper_columns(constraint_degree); + let mut helper_columns: Vec> = Vec::with_capacity(num_helper_columns); + + let looking_cols = lookup + .columns + .iter() + .map(|col| vec![col.clone()]) + .collect::>>>(); + + let grand_challenge = GrandProductChallenge { + beta: F::ONE, + gamma: challenge, + }; + + let columns_filters = looking_cols + .iter() + .zip(lookup.filter_columns.iter()) + .map(|(col, filter)| (&col[..], filter)) + .collect::>(); + // For each batch of `constraint_degree-1` columns `fi`, compute `sum 1/(f_i+challenge)` and + // add it to the helper columns. + // Note: these are the h_k(x) polynomials in the paper, with a few differences: + // * Here, the first ratio m_0(x)/phi_0(x) is not included with the columns batched up to create the + // h_k polynomials; instead there's a separate helper column for it (see below). + // * Here, we use 1 instead of -1 as the numerator (and subtract later). + // * Here, for now, the batch size (l) is always constraint_degree - 1 = 2. 
+ // * Here, there are filters for the columns, to only select some rows + // in a given column. + let mut helper_columns = get_helper_cols( + trace_poly_values, + trace_poly_values[0].len(), + &columns_filters, + grand_challenge, + constraint_degree, + ); + + // Add `1/(table+challenge)` to the helper columns. + // This is 1/phi_0(x) = 1/(x + t(x)) from the paper. + // Here, we don't include m(x) in the numerator, instead multiplying it with this column later. + let mut table = lookup.table_column.eval_all_rows(trace_poly_values); + for x in table.iter_mut() { + *x = challenge + *x; + } + let table_inverse: Vec = F::batch_multiplicative_inverse(&table); + + // Compute the `Z` polynomial with `Z(1)=0` and `Z(gx) = Z(x) + sum h_i(x) - frequencies(x)g(x)`. + // This enforces the check from the paper, that the sum of the h_k(x) polynomials is 0 over H. + // In the paper, that sum includes m(x)/(x + t(x)) = frequencies(x)/g(x), because that was bundled + // into the h_k(x) polynomials. + let frequencies = &lookup.frequencies_column.eval_all_rows(trace_poly_values); + let mut z = Vec::with_capacity(frequencies.len()); + z.push(F::ZERO); + for i in 0..frequencies.len() - 1 { + let x = helper_columns[..num_helper_columns - 1] + .iter() + .map(|col| col.values[i]) + .sum::() + - frequencies[i] * table_inverse[i]; + z.push(z[i] + x); + } + helper_columns.push(z.into()); + + helper_columns +} + +/// Given data associated to a lookup, check the associated helper polynomials. +pub(crate) fn eval_helper_columns( + filter: &[Option>], + columns: &[Vec

], + local_values: &[P], + next_values: &[P], + helper_columns: &[P], + constraint_degree: usize, + challenges: &GrandProductChallenge, + consumer: &mut ConstraintConsumer

, +) where + F: RichField + Extendable, + FE: FieldExtension, + P: PackedField, +{ + if !helper_columns.is_empty() { + for (j, chunk) in columns.chunks(constraint_degree - 1).enumerate() { + let fs = + &filter[(constraint_degree - 1) * j..(constraint_degree - 1) * j + chunk.len()]; + let h = helper_columns[j]; + + match chunk.len() { + 2 => { + let combin0 = challenges.combine(&chunk[0]); + let combin1 = challenges.combine(chunk[1].iter()); + + let f0 = if let Some(filter0) = &fs[0] { + filter0.eval_filter(local_values, next_values) + } else { + P::ONES + }; + let f1 = if let Some(filter1) = &fs[1] { + filter1.eval_filter(local_values, next_values) + } else { + P::ONES + }; + + consumer.constraint(combin1 * combin0 * h - f0 * combin1 - f1 * combin0); + } + 1 => { + let combin = challenges.combine(&chunk[0]); + let f0 = if let Some(filter1) = &fs[0] { + filter1.eval_filter(local_values, next_values) + } else { + P::ONES + }; + consumer.constraint(combin * h - f0); + } + + _ => todo!("Allow other constraint degrees"), + } + } + } +} + +/// Circuit version of `eval_helper_columns`. +/// Given data associated to a lookup (either a CTL or a range-check), check the associated helper polynomials. 
+pub(crate) fn eval_helper_columns_circuit, const D: usize>( + builder: &mut CircuitBuilder, + filter: &[Option>], + columns: &[Vec>], + local_values: &[ExtensionTarget], + next_values: &[ExtensionTarget], + helper_columns: &[ExtensionTarget], + constraint_degree: usize, + challenges: &GrandProductChallenge, + consumer: &mut RecursiveConstraintConsumer, +) { + if !helper_columns.is_empty() { + for (j, chunk) in columns.chunks(constraint_degree - 1).enumerate() { + let fs = + &filter[(constraint_degree - 1) * j..(constraint_degree - 1) * j + chunk.len()]; + let h = helper_columns[j]; + + let one = builder.one_extension(); + match chunk.len() { + 2 => { + let combin0 = challenges.combine_circuit(builder, &chunk[0]); + let combin1 = challenges.combine_circuit(builder, &chunk[1]); + + let f0 = if let Some(filter0) = &fs[0] { + filter0.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + let f1 = if let Some(filter1) = &fs[1] { + filter1.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + + let constr = builder.mul_sub_extension(combin0, h, f0); + let constr = builder.mul_extension(constr, combin1); + let f1_constr = builder.mul_extension(f1, combin0); + let constr = builder.sub_extension(constr, f1_constr); + + consumer.constraint(builder, constr); + } + 1 => { + let combin = challenges.combine_circuit(builder, &chunk[0]); + let f0 = if let Some(filter1) = &fs[0] { + filter1.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + let constr = builder.mul_sub_extension(combin, h, f0); + consumer.constraint(builder, constr); + } + + _ => todo!("Allow other constraint degrees"), + } + } + } +} + +/// Given a STARK's trace, and the data associated to one lookup (either CTL or range check), +/// returns the associated helper polynomials. 
+pub(crate) fn get_helper_cols( + trace: &[PolynomialValues], + degree: usize, + columns_filters: &[ColumnFilter], + challenge: GrandProductChallenge, + constraint_degree: usize, +) -> Vec> { + let num_helper_columns = ceil_div_usize(columns_filters.len(), constraint_degree - 1); + + let mut helper_columns = Vec::with_capacity(num_helper_columns); + + let mut filter_index = 0; + for mut cols_filts in &columns_filters.iter().chunks(constraint_degree - 1) { + let (first_col, first_filter) = cols_filts.next().unwrap(); + + let mut filter_col = Vec::with_capacity(degree); + let first_combined = (0..degree) + .map(|d| { + let f = if let Some(filter) = first_filter { + let f = filter.eval_table(trace, d); + filter_col.push(f); + f + } else { + filter_col.push(F::ONE); + F::ONE + }; + if f.is_one() { + let evals = first_col + .iter() + .map(|c| c.eval_table(trace, d)) + .collect::>(); + challenge.combine(evals.iter()) + } else { + assert_eq!(f, F::ZERO, "Non-binary filter?"); + // Dummy value. Cannot be zero since it will be batch-inverted. + F::ONE + } + }) + .collect::>(); + + let mut acc = F::batch_multiplicative_inverse(&first_combined); + for d in 0..degree { + if filter_col[d].is_zero() { + acc[d] = F::ZERO; + } + } + + for (col, filt) in cols_filts { + let mut filter_col = Vec::with_capacity(degree); + let mut combined = (0..degree) + .map(|d| { + let f = if let Some(filter) = filt { + let f = filter.eval_table(trace, d); + filter_col.push(f); + f + } else { + filter_col.push(F::ONE); + F::ONE + }; + if f.is_one() { + let evals = col + .iter() + .map(|c| c.eval_table(trace, d)) + .collect::>(); + challenge.combine(evals.iter()) + } else { + assert_eq!(f, F::ZERO, "Non-binary filter?"); + // Dummy value. Cannot be zero since it will be batch-inverted. 
+ F::ONE + } + }) + .collect::>(); + + combined = F::batch_multiplicative_inverse(&combined); + + for d in 0..degree { + if filter_col[d].is_zero() { + combined[d] = F::ZERO; + } + } + + batch_add_inplace(&mut acc, &combined); + } + + helper_columns.push(acc.into()); + } + assert_eq!(helper_columns.len(), num_helper_columns); + + helper_columns +} + +pub(crate) struct LookupCheckVars +where + F: Field, + FE: FieldExtension, + P: PackedField, +{ + pub(crate) local_values: Vec

, + pub(crate) next_values: Vec

, + pub(crate) challenges: Vec, +} + +/// Constraints for the logUp lookup argument. +pub(crate) fn eval_packed_lookups_generic( + stark: &S, + lookups: &[Lookup], + vars: &S::EvaluationFrame, + lookup_vars: LookupCheckVars, + yield_constr: &mut ConstraintConsumer

, +) where + F: RichField + Extendable, + FE: FieldExtension, + P: PackedField, + S: Stark, +{ + let local_values = vars.get_local_values(); + let next_values = vars.get_next_values(); + let degree = stark.constraint_degree(); + assert!( + degree == 2 || degree == 3, + "TODO: Allow other constraint degrees." + ); + let mut start = 0; + for lookup in lookups { + let num_helper_columns = lookup.num_helper_columns(degree); + for &challenge in &lookup_vars.challenges { + let grand_challenge = GrandProductChallenge { + beta: F::ONE, + gamma: challenge, + }; + let lookup_columns = lookup + .columns + .iter() + .map(|col| vec![col.eval_with_next(local_values, next_values)]) + .collect::>>(); + + // For each chunk, check that `h_i (x+f_2i) (x+f_{2i+1}) = (x+f_2i) * filter_{2i+1} + (x+f_{2i+1}) * filter_2i` + // if the chunk has length 2 or if it has length 1, check that `h_i * (x+f_2i) = filter_2i`, where x is the challenge + eval_helper_columns( + &lookup.filter_columns, + &lookup_columns, + local_values, + next_values, + &lookup_vars.local_values[start..start + num_helper_columns - 1], + degree, + &grand_challenge, + yield_constr, + ); + + let challenge = FE::from_basefield(challenge); + + // Check the `Z` polynomial. 
+ let z = lookup_vars.local_values[start + num_helper_columns - 1]; + let next_z = lookup_vars.next_values[start + num_helper_columns - 1]; + let table_with_challenge = lookup.table_column.eval(local_values) + challenge; + let y = lookup_vars.local_values[start..start + num_helper_columns - 1] + .iter() + .fold(P::ZEROS, |acc, x| acc + *x) + * table_with_challenge + - lookup.frequencies_column.eval(local_values); + // Check that in the first row, z = 0; + yield_constr.constraint_first_row(z); + yield_constr.constraint((next_z - z) * table_with_challenge - y); + start += num_helper_columns; + } + } +} + +pub(crate) struct LookupCheckVarsTarget { + pub(crate) local_values: Vec>, + pub(crate) next_values: Vec>, + pub(crate) challenges: Vec, +} + +pub(crate) fn eval_ext_lookups_circuit< + F: RichField + Extendable, + S: Stark, + const D: usize, +>( + builder: &mut CircuitBuilder, + stark: &S, + vars: &S::EvaluationFrameTarget, + lookup_vars: LookupCheckVarsTarget, + yield_constr: &mut RecursiveConstraintConsumer, +) { + let one = builder.one_extension(); + let degree = stark.constraint_degree(); + let lookups = stark.lookups(); + + let local_values = vars.get_local_values(); + let next_values = vars.get_next_values(); + assert!( + degree == 2 || degree == 3, + "TODO: Allow other constraint degrees." 
+ ); + let mut start = 0; + for lookup in lookups { + let num_helper_columns = lookup.num_helper_columns(degree); + let col_values = lookup + .columns + .iter() + .map(|col| vec![col.eval_with_next_circuit(builder, local_values, next_values)]) + .collect::>(); + + for &challenge in &lookup_vars.challenges { + let grand_challenge = GrandProductChallenge { + beta: builder.one(), + gamma: challenge, + }; + + eval_helper_columns_circuit( + builder, + &lookup.filter_columns, + &col_values, + local_values, + next_values, + &lookup_vars.local_values[start..start + num_helper_columns - 1], + degree, + &grand_challenge, + yield_constr, + ); + let challenge = builder.convert_to_ext(challenge); + + let z = lookup_vars.local_values[start + num_helper_columns - 1]; + let next_z = lookup_vars.next_values[start + num_helper_columns - 1]; + let table_column = lookup + .table_column + .eval_circuit(builder, vars.get_local_values()); + let table_with_challenge = builder.add_extension(table_column, challenge); + let mut y = builder.add_many_extension( + &lookup_vars.local_values[start..start + num_helper_columns - 1], + ); + + let frequencies_column = lookup + .frequencies_column + .eval_circuit(builder, vars.get_local_values()); + y = builder.mul_extension(y, table_with_challenge); + y = builder.sub_extension(y, frequencies_column); + + // Check that in the first row, z = 0; + yield_constr.constraint_first_row(builder, z); + let mut constraint = builder.sub_extension(next_z, z); + constraint = builder.mul_extension(constraint, table_with_challenge); + constraint = builder.sub_extension(constraint, y); + yield_constr.constraint(builder, constraint); + start += num_helper_columns; + } + } +} diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs deleted file mode 100644 index 1059a79b7f..0000000000 --- a/starky/src/permutation.rs +++ /dev/null @@ -1,398 +0,0 @@ -//! Permutation arguments. 
- -use alloc::vec; -use alloc::vec::Vec; - -use itertools::Itertools; -use plonky2::field::batch_util::batch_multiply_inplace; -use plonky2::field::extension::{Extendable, FieldExtension}; -use plonky2::field::packed::PackedField; -use plonky2::field::polynomial::PolynomialValues; -use plonky2::field::types::Field; -use plonky2::hash::hash_types::RichField; -use plonky2::iop::challenger::{Challenger, RecursiveChallenger}; -use plonky2::iop::ext_target::ExtensionTarget; -use plonky2::iop::target::Target; -use plonky2::plonk::circuit_builder::CircuitBuilder; -use plonky2::plonk::config::{AlgebraicHasher, Hasher}; -use plonky2::util::reducing::{ReducingFactor, ReducingFactorTarget}; -use plonky2_maybe_rayon::*; - -use crate::config::StarkConfig; -use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::evaluation_frame::StarkEvaluationFrame; -use crate::stark::Stark; - -/// A pair of lists of columns, `lhs` and `rhs`, that should be permutations of one another. -/// In particular, there should exist some permutation `pi` such that for any `i`, -/// `trace[lhs[i]] = pi(trace[rhs[i]])`. Here `trace` denotes the trace in column-major form, so -/// `trace[col]` is a column vector. -pub struct PermutationPair { - /// Each entry contains two column indices, representing two columns which should be - /// permutations of one another. - pub column_pairs: Vec<(usize, usize)>, -} - -impl PermutationPair { - pub fn singletons(lhs: usize, rhs: usize) -> Self { - Self { - column_pairs: vec![(lhs, rhs)], - } - } -} - -/// A single instance of a permutation check protocol. -pub(crate) struct PermutationInstance<'a, T: Copy> { - pub(crate) pair: &'a PermutationPair, - pub(crate) challenge: PermutationChallenge, -} - -/// Randomness for a single instance of a permutation check protocol. -#[derive(Copy, Clone)] -pub(crate) struct PermutationChallenge { - /// Randomness used to combine multiple columns into one. 
- pub(crate) beta: T, - /// Random offset that's added to the beta-reduced column values. - pub(crate) gamma: T, -} - -/// Like `PermutationChallenge`, but with `num_challenges` copies to boost soundness. -#[derive(Clone)] -pub(crate) struct PermutationChallengeSet { - pub(crate) challenges: Vec>, -} - -/// Compute all Z polynomials (for permutation arguments). -pub(crate) fn compute_permutation_z_polys( - stark: &S, - config: &StarkConfig, - trace_poly_values: &[PolynomialValues], - permutation_challenge_sets: &[PermutationChallengeSet], -) -> Vec> -where - F: RichField + Extendable, - S: Stark, -{ - let permutation_pairs = stark.permutation_pairs(); - let permutation_batches = get_permutation_batches( - &permutation_pairs, - permutation_challenge_sets, - config.num_challenges, - stark.permutation_batch_size(), - ); - - permutation_batches - .into_par_iter() - .map(|instances| compute_permutation_z_poly(&instances, trace_poly_values)) - .collect() -} - -/// Compute a single Z polynomial. -fn compute_permutation_z_poly( - instances: &[PermutationInstance], - trace_poly_values: &[PolynomialValues], -) -> PolynomialValues { - let degree = trace_poly_values[0].len(); - let (reduced_lhs_polys, reduced_rhs_polys): (Vec<_>, Vec<_>) = instances - .iter() - .map(|instance| permutation_reduced_polys(instance, trace_poly_values, degree)) - .unzip(); - - let numerator = poly_product_elementwise(reduced_lhs_polys.into_iter()); - let denominator = poly_product_elementwise(reduced_rhs_polys.into_iter()); - - // Compute the quotients. - let denominator_inverses = F::batch_multiplicative_inverse(&denominator.values); - let mut quotients = numerator.values; - batch_multiply_inplace(&mut quotients, &denominator_inverses); - - // Compute Z, which contains partial products of the quotients. 
- let mut partial_products = Vec::with_capacity(degree); - let mut acc = F::ONE; - for q in quotients { - partial_products.push(acc); - acc *= q; - } - PolynomialValues::new(partial_products) -} - -/// Computes the reduced polynomial, `\sum beta^i f_i(x) + gamma`, for both the "left" and "right" -/// sides of a given `PermutationPair`. -fn permutation_reduced_polys( - instance: &PermutationInstance, - trace_poly_values: &[PolynomialValues], - degree: usize, -) -> (PolynomialValues, PolynomialValues) { - let PermutationInstance { - pair: PermutationPair { column_pairs }, - challenge: PermutationChallenge { beta, gamma }, - } = instance; - - let mut reduced_lhs = PolynomialValues::constant(*gamma, degree); - let mut reduced_rhs = PolynomialValues::constant(*gamma, degree); - for ((lhs, rhs), weight) in column_pairs.iter().zip(beta.powers()) { - reduced_lhs.add_assign_scaled(&trace_poly_values[*lhs], weight); - reduced_rhs.add_assign_scaled(&trace_poly_values[*rhs], weight); - } - (reduced_lhs, reduced_rhs) -} - -/// Computes the elementwise product of a set of polynomials. Assumes that the set is non-empty and -/// that each polynomial has the same length. 
-fn poly_product_elementwise( - mut polys: impl Iterator>, -) -> PolynomialValues { - let mut product = polys.next().expect("Expected at least one polynomial"); - for poly in polys { - batch_multiply_inplace(&mut product.values, &poly.values) - } - product -} - -fn get_permutation_challenge>( - challenger: &mut Challenger, -) -> PermutationChallenge { - let beta = challenger.get_challenge(); - let gamma = challenger.get_challenge(); - PermutationChallenge { beta, gamma } -} - -fn get_permutation_challenge_set>( - challenger: &mut Challenger, - num_challenges: usize, -) -> PermutationChallengeSet { - let challenges = (0..num_challenges) - .map(|_| get_permutation_challenge(challenger)) - .collect(); - PermutationChallengeSet { challenges } -} - -pub(crate) fn get_n_permutation_challenge_sets>( - challenger: &mut Challenger, - num_challenges: usize, - num_sets: usize, -) -> Vec> { - (0..num_sets) - .map(|_| get_permutation_challenge_set(challenger, num_challenges)) - .collect() -} - -fn get_permutation_challenge_target< - F: RichField + Extendable, - H: AlgebraicHasher, - const D: usize, ->( - builder: &mut CircuitBuilder, - challenger: &mut RecursiveChallenger, -) -> PermutationChallenge { - let beta = challenger.get_challenge(builder); - let gamma = challenger.get_challenge(builder); - PermutationChallenge { beta, gamma } -} - -fn get_permutation_challenge_set_target< - F: RichField + Extendable, - H: AlgebraicHasher, - const D: usize, ->( - builder: &mut CircuitBuilder, - challenger: &mut RecursiveChallenger, - num_challenges: usize, -) -> PermutationChallengeSet { - let challenges = (0..num_challenges) - .map(|_| get_permutation_challenge_target(builder, challenger)) - .collect(); - PermutationChallengeSet { challenges } -} - -pub(crate) fn get_n_permutation_challenge_sets_target< - F: RichField + Extendable, - H: AlgebraicHasher, - const D: usize, ->( - builder: &mut CircuitBuilder, - challenger: &mut RecursiveChallenger, - num_challenges: usize, - num_sets: 
usize, -) -> Vec> { - (0..num_sets) - .map(|_| get_permutation_challenge_set_target(builder, challenger, num_challenges)) - .collect() -} - -/// Get a list of instances of our batch-permutation argument. These are permutation arguments -/// where the same `Z(x)` polynomial is used to check more than one permutation. -/// Before batching, each permutation pair leads to `num_challenges` permutation arguments, so we -/// start with the cartesian product of `permutation_pairs` and `0..num_challenges`. Then we -/// chunk these arguments based on our batch size. -pub(crate) fn get_permutation_batches<'a, T: Copy>( - permutation_pairs: &'a [PermutationPair], - permutation_challenge_sets: &[PermutationChallengeSet], - num_challenges: usize, - batch_size: usize, -) -> Vec>> { - permutation_pairs - .iter() - .cartesian_product(0..num_challenges) - .chunks(batch_size) - .into_iter() - .map(|batch| { - batch - .enumerate() - .map(|(i, (pair, chal))| { - let challenge = permutation_challenge_sets[i].challenges[chal]; - PermutationInstance { pair, challenge } - }) - .collect_vec() - }) - .collect() -} - -pub struct PermutationCheckVars -where - F: Field, - FE: FieldExtension, - P: PackedField, -{ - pub(crate) local_zs: Vec

, - pub(crate) next_zs: Vec

, - pub(crate) permutation_challenge_sets: Vec>, -} - -pub(crate) fn eval_permutation_checks( - stark: &S, - config: &StarkConfig, - vars: &S::EvaluationFrame, - permutation_data: PermutationCheckVars, - consumer: &mut ConstraintConsumer

, -) where - F: RichField + Extendable, - FE: FieldExtension, - P: PackedField, - S: Stark, -{ - let local_values = vars.get_local_values(); - - let PermutationCheckVars { - local_zs, - next_zs, - permutation_challenge_sets, - } = permutation_data; - - // Check that Z(1) = 1; - for &z in &local_zs { - consumer.constraint_first_row(z - FE::ONE); - } - - let permutation_pairs = stark.permutation_pairs(); - - let permutation_batches = get_permutation_batches( - &permutation_pairs, - &permutation_challenge_sets, - config.num_challenges, - stark.permutation_batch_size(), - ); - - // Each zs value corresponds to a permutation batch. - for (i, instances) in permutation_batches.iter().enumerate() { - // Z(gx) * down = Z x * up - let (reduced_lhs, reduced_rhs): (Vec

, Vec

) = instances - .iter() - .map(|instance| { - let PermutationInstance { - pair: PermutationPair { column_pairs }, - challenge: PermutationChallenge { beta, gamma }, - } = instance; - let mut factor = ReducingFactor::new(*beta); - let (lhs, rhs): (Vec<_>, Vec<_>) = column_pairs - .iter() - .map(|&(i, j)| (local_values[i], local_values[j])) - .unzip(); - ( - factor.reduce_ext(lhs.into_iter()) + FE::from_basefield(*gamma), - factor.reduce_ext(rhs.into_iter()) + FE::from_basefield(*gamma), - ) - }) - .unzip(); - let constraint = next_zs[i] * reduced_rhs.into_iter().product::

() - - local_zs[i] * reduced_lhs.into_iter().product::

(); - consumer.constraint(constraint); - } -} - -pub struct PermutationCheckDataTarget { - pub(crate) local_zs: Vec>, - pub(crate) next_zs: Vec>, - pub(crate) permutation_challenge_sets: Vec>, -} - -pub(crate) fn eval_permutation_checks_circuit( - builder: &mut CircuitBuilder, - stark: &S, - config: &StarkConfig, - vars: &S::EvaluationFrameTarget, - permutation_data: PermutationCheckDataTarget, - consumer: &mut RecursiveConstraintConsumer, -) where - F: RichField + Extendable, - S: Stark, -{ - let local_values = vars.get_local_values(); - - let PermutationCheckDataTarget { - local_zs, - next_zs, - permutation_challenge_sets, - } = permutation_data; - - let one = builder.one_extension(); - // Check that Z(1) = 1; - for &z in &local_zs { - let z_1 = builder.sub_extension(z, one); - consumer.constraint_first_row(builder, z_1); - } - - let permutation_pairs = stark.permutation_pairs(); - - let permutation_batches = get_permutation_batches( - &permutation_pairs, - &permutation_challenge_sets, - config.num_challenges, - stark.permutation_batch_size(), - ); - - // Each zs value corresponds to a permutation batch. 
- for (i, instances) in permutation_batches.iter().enumerate() { - let (reduced_lhs, reduced_rhs): (Vec>, Vec>) = - instances - .iter() - .map(|instance| { - let PermutationInstance { - pair: PermutationPair { column_pairs }, - challenge: PermutationChallenge { beta, gamma }, - } = instance; - let beta_ext = builder.convert_to_ext(*beta); - let gamma_ext = builder.convert_to_ext(*gamma); - let mut factor = ReducingFactorTarget::new(beta_ext); - let (lhs, rhs): (Vec<_>, Vec<_>) = column_pairs - .iter() - .map(|&(i, j)| (local_values[i], local_values[j])) - .unzip(); - let reduced_lhs = factor.reduce(&lhs, builder); - let reduced_rhs = factor.reduce(&rhs, builder); - ( - builder.add_extension(reduced_lhs, gamma_ext), - builder.add_extension(reduced_rhs, gamma_ext), - ) - }) - .unzip(); - let reduced_lhs_product = builder.mul_many_extension(reduced_lhs); - let reduced_rhs_product = builder.mul_many_extension(reduced_rhs); - // constraint = next_zs[i] * reduced_rhs_product - local_zs[i] * reduced_lhs_product - let constraint = { - let tmp = builder.mul_extension(local_zs[i], reduced_lhs_product); - builder.mul_sub_extension(next_zs[i], reduced_rhs_product, tmp) - }; - consumer.constraint(builder, constraint) - } -} diff --git a/starky/src/proof.rs b/starky/src/proof.rs index 6bd5f78761..e22399288e 100644 --- a/starky/src/proof.rs +++ b/starky/src/proof.rs @@ -18,14 +18,14 @@ use plonky2::plonk::config::GenericConfig; use plonky2_maybe_rayon::*; use crate::config::StarkConfig; -use crate::permutation::PermutationChallengeSet; +use crate::lookup::GrandProductChallengeSet; #[derive(Debug, Clone)] pub struct StarkProof, C: GenericConfig, const D: usize> { /// Merkle cap of LDEs of trace values. pub trace_cap: MerkleCap, /// Merkle cap of LDEs of permutation Z values. - pub permutation_zs_cap: Option>, + pub auxiliary_polys_cap: Option>, /// Merkle cap of LDEs of trace values. pub quotient_polys_cap: MerkleCap, /// Purported values of each polynomial at the challenge point. 
@@ -48,7 +48,7 @@ impl, C: GenericConfig, const D: usize> S pub struct StarkProofTarget { pub trace_cap: MerkleCapTarget, - pub permutation_zs_cap: Option, + pub auxiliary_polys_cap: Option, pub quotient_polys_cap: MerkleCapTarget, pub openings: StarkOpeningSetTarget, pub opening_proof: FriProofTarget, @@ -106,7 +106,7 @@ pub struct CompressedStarkProofWithPublicInputs< pub(crate) struct StarkProofChallenges, const D: usize> { /// Randomness used in any permutation arguments. - pub permutation_challenge_sets: Option>>, + pub lookup_challenge_set: Option>, /// Random values used to combine STARK constraints. pub stark_alphas: Vec, @@ -118,7 +118,7 @@ pub(crate) struct StarkProofChallenges, const D: us } pub(crate) struct StarkProofChallengesTarget { - pub permutation_challenge_sets: Option>>, + pub lookup_challenge_set: Option>, pub stark_alphas: Vec, pub stark_zeta: ExtensionTarget, pub fri_challenges: FriChallengesTarget, @@ -129,8 +129,8 @@ pub(crate) struct StarkProofChallengesTarget { pub struct StarkOpeningSet, const D: usize> { pub local_values: Vec, pub next_values: Vec, - pub permutation_zs: Option>, - pub permutation_zs_next: Option>, + pub auxiliary_polys: Option>, + pub auxiliary_polys_next: Option>, pub quotient_polys: Vec, } @@ -139,7 +139,7 @@ impl, const D: usize> StarkOpeningSet { zeta: F::Extension, g: F, trace_commitment: &PolynomialBatch, - permutation_zs_commitment: Option<&PolynomialBatch>, + auxiliary_polys_commitment: Option<&PolynomialBatch>, quotient_commitment: &PolynomialBatch, ) -> Self { let eval_commitment = |z: F::Extension, c: &PolynomialBatch| { @@ -152,8 +152,8 @@ impl, const D: usize> StarkOpeningSet { Self { local_values: eval_commitment(zeta, trace_commitment), next_values: eval_commitment(zeta_next, trace_commitment), - permutation_zs: permutation_zs_commitment.map(|c| eval_commitment(zeta, c)), - permutation_zs_next: permutation_zs_commitment.map(|c| eval_commitment(zeta_next, c)), + auxiliary_polys: 
auxiliary_polys_commitment.map(|c| eval_commitment(zeta, c)), + auxiliary_polys_next: auxiliary_polys_commitment.map(|c| eval_commitment(zeta_next, c)), quotient_polys: eval_commitment(zeta, quotient_commitment), } } @@ -163,7 +163,7 @@ impl, const D: usize> StarkOpeningSet { values: self .local_values .iter() - .chain(self.permutation_zs.iter().flatten()) + .chain(self.auxiliary_polys.iter().flatten()) .chain(&self.quotient_polys) .copied() .collect_vec(), @@ -172,7 +172,7 @@ impl, const D: usize> StarkOpeningSet { values: self .next_values .iter() - .chain(self.permutation_zs_next.iter().flatten()) + .chain(self.auxiliary_polys_next.iter().flatten()) .copied() .collect_vec(), }; @@ -185,8 +185,8 @@ impl, const D: usize> StarkOpeningSet { pub struct StarkOpeningSetTarget { pub local_values: Vec>, pub next_values: Vec>, - pub permutation_zs: Option>>, - pub permutation_zs_next: Option>>, + pub auxiliary_polys: Option>>, + pub auxiliary_polys_next: Option>>, pub quotient_polys: Vec>, } @@ -196,7 +196,7 @@ impl StarkOpeningSetTarget { values: self .local_values .iter() - .chain(self.permutation_zs.iter().flatten()) + .chain(self.auxiliary_polys.iter().flatten()) .chain(&self.quotient_polys) .copied() .collect_vec(), @@ -205,7 +205,7 @@ impl StarkOpeningSetTarget { values: self .next_values .iter() - .chain(self.permutation_zs_next.iter().flatten()) + .chain(self.auxiliary_polys_next.iter().flatten()) .copied() .collect_vec(), }; diff --git a/starky/src/prover.rs b/starky/src/prover.rs index 23808e0f4e..f9b40217d6 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -21,14 +21,14 @@ use plonky2_maybe_rayon::*; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; use crate::evaluation_frame::StarkEvaluationFrame; -use crate::permutation::{ - compute_permutation_z_polys, get_n_permutation_challenge_sets, PermutationChallengeSet, - PermutationCheckVars, +use crate::lookup::{ + get_grand_product_challenge_set, 
lookup_helper_columns, Lookup, LookupCheckVars, }; use crate::proof::{StarkOpeningSet, StarkProof, StarkProofWithPublicInputs}; use crate::stark::Stark; use crate::vanishing_poly::eval_vanishing_poly; +#[allow(clippy::useless_asref)] pub fn prove( stark: S, config: &StarkConfig, @@ -55,8 +55,6 @@ where timing, "compute trace commitment", PolynomialBatch::::from_values( - // TODO: Cloning this isn't great; consider having `from_values` accept a reference, - // or having `compute_permutation_z_polys` read trace values from the `PolynomialBatch`. trace_poly_values.clone(), rate_bits, false, @@ -70,25 +68,45 @@ where let mut challenger = Challenger::new(); challenger.observe_cap(&trace_cap); - // Permutation arguments. - let permutation_zs_commitment_challenges = stark.uses_permutation_args().then(|| { - let permutation_challenge_sets = get_n_permutation_challenge_sets( - &mut challenger, - config.num_challenges, - stark.permutation_batch_size(), - ); - let permutation_z_polys = compute_permutation_z_polys::( - &stark, - config, - &trace_poly_values, - &permutation_challenge_sets, - ); + // Lookup argument. 
+ let constraint_degree = stark.constraint_degree(); + let lookups = stark.lookups(); + let lookup_challenges = stark.uses_lookups().then(|| { + get_grand_product_challenge_set(&mut challenger, config.num_challenges) + .challenges + .iter() + .map(|ch| ch.beta) + .collect::>() + }); + + let num_lookup_columns = lookups + .iter() + .map(|l| l.num_helper_columns(constraint_degree)) + .sum(); - let permutation_zs_commitment = timed!( + let auxiliary_polys_commitment = stark.uses_lookups().then(|| { + let lookup_helper_columns = timed!(timing, "compute lookup helper columns", { + let challenges = lookup_challenges.as_ref().expect("We do have challenges."); + let mut columns = Vec::with_capacity(num_lookup_columns); + for lookup in &lookups { + for &challenge in challenges { + columns.extend(lookup_helper_columns( + lookup, + &trace_poly_values, + challenge, + constraint_degree, + )); + } + } + columns + }); + + // Get the polynomial commitments for all auxiliary polynomials. + let auxiliary_polys_commitment = timed!( timing, "compute permutation Z commitments", PolynomialBatch::from_values( - permutation_z_polys, + lookup_helper_columns, rate_bits, false, config.fri_config.cap_height, @@ -96,38 +114,68 @@ where None, ) ); - (permutation_zs_commitment, permutation_challenge_sets) + + auxiliary_polys_commitment }); - let permutation_zs_commitment = permutation_zs_commitment_challenges - .as_ref() - .map(|(comm, _)| comm); - let permutation_zs_cap = permutation_zs_commitment + + let auxiliary_polys_cap = auxiliary_polys_commitment .as_ref() .map(|commit| commit.merkle_tree.cap.clone()); - if let Some(cap) = &permutation_zs_cap { + if let Some(cap) = &auxiliary_polys_cap { challenger.observe_cap(cap); } let alphas = challenger.get_n_challenges(config.num_challenges); - let quotient_polys = compute_quotient_polys::::Packing, C, S, D>( - &stark, - &trace_commitment, - &permutation_zs_commitment_challenges, - public_inputs, - alphas, - degree_bits, - config, + + #[cfg(test)] 
+ { + check_constraints( + &stark, + &trace_commitment, + public_inputs, + &auxiliary_polys_commitment, + lookup_challenges.as_ref(), + &lookups, + alphas.clone(), + degree_bits, + num_lookup_columns, + ); + } + + let quotient_polys = timed!( + timing, + "compute quotient polys", + compute_quotient_polys::::Packing, C, S, D>( + &stark, + &trace_commitment, + &auxiliary_polys_commitment, + lookup_challenges.as_ref(), + &lookups, + public_inputs, + alphas, + degree_bits, + num_lookup_columns, + config, + ) ); - let all_quotient_chunks = quotient_polys - .into_par_iter() - .flat_map(|mut quotient_poly| { - quotient_poly - .trim_to_len(degree * stark.quotient_degree_factor()) - .expect("Quotient has failed, the vanishing polynomial is not divisible by Z_H"); - // Split quotient into degree-n chunks. - quotient_poly.chunks(degree) - }) - .collect(); + + let all_quotient_chunks = timed!( + timing, + "split quotient polys", + quotient_polys + .into_par_iter() + .flat_map(|mut quotient_poly| { + quotient_poly + .trim_to_len(degree * stark.quotient_degree_factor()) + .expect( + "Quotient has failed, the vanishing polynomial is not divisible by Z_H", + ); + // Split quotient into degree-n chunks. + quotient_poly.chunks(degree) + }) + .collect() + ); + let quotient_commitment = timed!( timing, "compute quotient commitment", @@ -140,6 +188,8 @@ where None, ) ); + + // Observe the quotient polynomials Merkle cap. let quotient_polys_cap = quotient_commitment.merkle_tree.cap.clone(); challenger.observe_cap("ient_polys_cap); @@ -152,17 +202,21 @@ where zeta.exp_power_of_2(degree_bits) != F::Extension::ONE, "Opening point is in the subgroup." ); + + // Compute all openings: evaluate all committed polynomials at `zeta` and, when necessary, at `g * zeta`. let openings = StarkOpeningSet::new( zeta, g, &trace_commitment, - permutation_zs_commitment, + auxiliary_polys_commitment.as_ref(), "ient_commitment, ); + + // Get the FRI openings and observe them. 
challenger.observe_openings(&openings.to_fri_openings()); let initial_merkle_trees = once(&trace_commitment) - .chain(permutation_zs_commitment) + .chain(&auxiliary_polys_commitment) .chain(once("ient_commitment)) .collect_vec(); @@ -179,7 +233,7 @@ where ); let proof = StarkProof { trace_cap, - permutation_zs_cap, + auxiliary_polys_cap, quotient_polys_cap, openings, opening_proof, @@ -196,13 +250,13 @@ where fn compute_quotient_polys<'a, F, P, C, S, const D: usize>( stark: &S, trace_commitment: &'a PolynomialBatch, - permutation_zs_commitment_challenges: &'a Option<( - PolynomialBatch, - Vec>, - )>, + auxiliary_polys_commitment: &'a Option>, + lookup_challenges: Option<&'a Vec>, + lookups: &[Lookup], public_inputs: &[F], alphas: Vec, degree_bits: usize, + num_lookup_columns: usize, config: &StarkConfig, ) -> Vec> where @@ -264,23 +318,35 @@ where lagrange_basis_first, lagrange_basis_last, ); + // Get the local and next row evaluations for the current STARK, + // as well as the public inputs. let vars = S::EvaluationFrame::from_values( &get_trace_values_packed(i_start), &get_trace_values_packed(i_next_start), public_inputs, ); - let permutation_check_data = permutation_zs_commitment_challenges.as_ref().map( - |(permutation_zs_commitment, permutation_challenge_sets)| PermutationCheckVars { - local_zs: permutation_zs_commitment.get_lde_values_packed(i_start, step), - next_zs: permutation_zs_commitment.get_lde_values_packed(i_next_start, step), - permutation_challenge_sets: permutation_challenge_sets.to_vec(), - }, - ); + // Get the local and next row evaluations for the permutation argument, + // as well as the associated challenges. 
+ let lookup_vars = lookup_challenges.map(|challenges| LookupCheckVars { + local_values: auxiliary_polys_commitment + .as_ref() + .unwrap() + .get_lde_values_packed(i_start, step) + .to_vec(), + next_values: auxiliary_polys_commitment + .as_ref() + .unwrap() + .get_lde_values_packed(i_next_start, step), + challenges: challenges.to_vec(), + }); + + // Evaluate the polynomial combining all constraints, including + // those associated to the permutation arguments. eval_vanishing_poly::( stark, - config, &vars, - permutation_check_data, + lookups, + lookup_vars, &mut consumer, ); @@ -308,3 +374,102 @@ where .map(|values| values.coset_ifft(F::coset_shift())) .collect() } + +#[cfg(test)] +/// Check that all constraints evaluate to zero on `H`. +/// Can also be used to check the degree of the constraints by evaluating on a larger subgroup. +fn check_constraints<'a, F, C, S, const D: usize>( + stark: &S, + trace_commitment: &'a PolynomialBatch, + public_inputs: &[F], + auxiliary_commitment: &'a Option>, + lookup_challenges: Option<&'a Vec>, + lookups: &[Lookup], + alphas: Vec, + degree_bits: usize, + num_lookup_columns: usize, +) where + F: RichField + Extendable, + C: GenericConfig, + S: Stark, +{ + let degree = 1 << degree_bits; + let rate_bits = 0; // Set this to higher value to check constraint degree. + + let size = degree << rate_bits; + let step = 1 << rate_bits; + + // Evaluation of the first Lagrange polynomial. + let lagrange_first = PolynomialValues::selector(degree, 0).lde(rate_bits); + // Evaluation of the last Lagrange polynomial. + let lagrange_last = PolynomialValues::selector(degree, degree - 1).lde(rate_bits); + + let subgroup = F::two_adic_subgroup(degree_bits + rate_bits); + + // Get the evaluations of a batch of polynomials over our subgroup. 
+ let get_subgroup_evals = |comm: &PolynomialBatch| -> Vec> { + let values = comm + .polynomials + .par_iter() + .map(|coeffs| coeffs.clone().fft().values) + .collect::>(); + transpose(&values) + }; + + // Get batch evaluations of the trace and permutation polynomials over our subgroup. + let trace_subgroup_evals = get_subgroup_evals(trace_commitment); + let auxiliary_subgroup_evals = auxiliary_commitment.as_ref().map(get_subgroup_evals); + + // Last element of the subgroup. + let last = F::primitive_root_of_unity(degree_bits).inverse(); + + let constraint_values = (0..size) + .map(|i| { + let i_next = (i + step) % size; + + let x = subgroup[i]; + let z_last = x - last; + let lagrange_basis_first = lagrange_first.values[i]; + let lagrange_basis_last = lagrange_last.values[i]; + + let mut consumer = ConstraintConsumer::new( + alphas.clone(), + z_last, + lagrange_basis_first, + lagrange_basis_last, + ); + // Get the local and next row evaluations for the current STARK's trace. + let vars = S::EvaluationFrame::from_values( + &trace_subgroup_evals[i], + &trace_subgroup_evals[i_next], + public_inputs, + ); + // Get the local and next row evaluations for the current STARK's permutation argument. + let lookup_vars = lookup_challenges.map(|challenges| LookupCheckVars { + local_values: auxiliary_subgroup_evals.as_ref().unwrap()[i].clone(), + next_values: auxiliary_subgroup_evals.as_ref().unwrap()[i_next].clone(), + challenges: challenges.to_vec(), + }); + + // Evaluate the polynomial combining all constraints, including those associated + // to the permutation arguments. + eval_vanishing_poly::( + stark, + &vars, + lookups, + lookup_vars, + &mut consumer, + ); + consumer.accumulators() + }) + .collect::>(); + + // Assert that all constraints evaluate to 0 over our subgroup. 
+ for v in constraint_values { + assert!( + v.iter().all(|x| x.is_zero()), + "Constraint failed in {}", + core::any::type_name::() + ); + } +} diff --git a/starky/src/recursive_verifier.rs b/starky/src/recursive_verifier.rs index bd5d2f1916..e91583f19b 100644 --- a/starky/src/recursive_verifier.rs +++ b/starky/src/recursive_verifier.rs @@ -1,3 +1,5 @@ +use alloc::vec; +use alloc::vec::Vec; use core::iter::once; use anyhow::{ensure, Result}; @@ -16,7 +18,7 @@ use plonky2::with_context; use crate::config::StarkConfig; use crate::constraint_consumer::RecursiveConstraintConsumer; use crate::evaluation_frame::StarkEvaluationFrame; -use crate::permutation::PermutationCheckDataTarget; +use crate::lookup::LookupCheckVarsTarget; use crate::proof::{ StarkOpeningSetTarget, StarkProof, StarkProofChallengesTarget, StarkProofTarget, StarkProofWithPublicInputs, StarkProofWithPublicInputsTarget, @@ -42,7 +44,7 @@ pub fn verify_stark_proof_circuit< let challenges = with_context!( builder, "compute challenges", - proof_with_pis.get_challenges::(builder, &stark, inner_config) + proof_with_pis.get_challenges::(builder, inner_config) ); verify_stark_proof_with_challenges_circuit::( @@ -71,7 +73,7 @@ fn verify_stark_proof_with_challenges_circuit< ) where C::Hasher: AlgebraicHasher, { - check_permutation_options(&stark, &proof_with_pis, &challenges).unwrap(); + check_lookup_options(&stark, &proof_with_pis, &challenges).unwrap(); let one = builder.one_extension(); let StarkProofWithPublicInputsTarget { @@ -81,8 +83,8 @@ fn verify_stark_proof_with_challenges_circuit< let StarkOpeningSetTarget { local_values, next_values, - permutation_zs, - permutation_zs_next, + auxiliary_polys, + auxiliary_polys_next, quotient_polys, } = &proof.openings; @@ -111,25 +113,27 @@ fn verify_stark_proof_with_challenges_circuit< l_last, ); - let permutation_data = stark - .uses_permutation_args() - .then(|| PermutationCheckDataTarget { - local_zs: permutation_zs.as_ref().unwrap().clone(), - next_zs: 
permutation_zs_next.as_ref().unwrap().clone(), - permutation_challenge_sets: challenges.permutation_challenge_sets.unwrap(), - }); + let num_lookup_columns = stark.num_lookup_helper_columns(inner_config); + let lookup_challenges = stark.uses_lookups().then(|| { + challenges + .lookup_challenge_set + .unwrap() + .challenges + .iter() + .map(|ch| ch.beta) + .collect::>() + }); + + let lookup_vars = stark.uses_lookups().then(|| LookupCheckVarsTarget { + local_values: auxiliary_polys.as_ref().unwrap()[..num_lookup_columns].to_vec(), + next_values: auxiliary_polys_next.as_ref().unwrap()[..num_lookup_columns].to_vec(), + challenges: lookup_challenges.unwrap(), + }); with_context!( builder, "evaluate vanishing polynomial", - eval_vanishing_poly_circuit::( - builder, - &stark, - inner_config, - &vars, - permutation_data, - &mut consumer, - ) + eval_vanishing_poly_circuit::(builder, &stark, &vars, lookup_vars, &mut consumer) ); let vanishing_polys_zeta = consumer.accumulators(); @@ -145,7 +149,7 @@ fn verify_stark_proof_with_challenges_circuit< } let merkle_caps = once(proof.trace_cap) - .chain(proof.permutation_zs_cap) + .chain(proof.auxiliary_polys_cap) .chain(once(proof.quotient_polys_cap)) .collect_vec(); @@ -211,22 +215,19 @@ pub fn add_virtual_stark_proof, S: Stark, con let fri_params = config.fri_params(degree_bits); let cap_height = fri_params.config.cap_height; - let num_leaves_per_oracle = once(S::COLUMNS) - .chain( - stark - .uses_permutation_args() - .then(|| stark.num_permutation_batches(config)), - ) - .chain(once(stark.quotient_degree_factor() * config.num_challenges)) - .collect_vec(); + let num_leaves_per_oracle = vec![ + S::COLUMNS, + stark.num_lookup_helper_columns(config), + stark.quotient_degree_factor() * config.num_challenges, + ]; - let permutation_zs_cap = stark - .uses_permutation_args() + let auxiliary_polys_cap = stark + .uses_lookups() .then(|| builder.add_virtual_cap(cap_height)); StarkProofTarget { trace_cap: 
builder.add_virtual_cap(cap_height), - permutation_zs_cap, + auxiliary_polys_cap, quotient_polys_cap: builder.add_virtual_cap(cap_height), openings: add_stark_opening_set_target::(builder, stark, config), opening_proof: builder.add_virtual_fri_proof(&num_leaves_per_oracle, &fri_params), @@ -242,12 +243,12 @@ fn add_stark_opening_set_target, S: Stark, co StarkOpeningSetTarget { local_values: builder.add_virtual_extension_targets(S::COLUMNS), next_values: builder.add_virtual_extension_targets(S::COLUMNS), - permutation_zs: stark - .uses_permutation_args() - .then(|| builder.add_virtual_extension_targets(stark.num_permutation_batches(config))), - permutation_zs_next: stark - .uses_permutation_args() - .then(|| builder.add_virtual_extension_targets(stark.num_permutation_batches(config))), + auxiliary_polys: stark.uses_lookups().then(|| { + builder.add_virtual_extension_targets(stark.num_lookup_helper_columns(config)) + }), + auxiliary_polys_next: stark.uses_lookups().then(|| { + builder.add_virtual_extension_targets(stark.num_lookup_helper_columns(config)) + }), quotient_polys: builder .add_virtual_extension_targets(stark.quotient_degree_factor() * num_challenges), } @@ -296,33 +297,34 @@ pub fn set_stark_proof_target, W, const D: usize>( &proof.openings.to_fri_openings(), ); - if let (Some(permutation_zs_cap_target), Some(permutation_zs_cap)) = - (&proof_target.permutation_zs_cap, &proof.permutation_zs_cap) - { - witness.set_cap_target(permutation_zs_cap_target, permutation_zs_cap); + if let (Some(auxiliary_polys_cap_target), Some(auxiliary_polys_cap)) = ( + &proof_target.auxiliary_polys_cap, + &proof.auxiliary_polys_cap, + ) { + witness.set_cap_target(auxiliary_polys_cap_target, auxiliary_polys_cap); } set_fri_proof_target(witness, &proof_target.opening_proof, &proof.opening_proof); } -/// Utility function to check that all permutation data wrapped in `Option`s are `Some` iff +/// Utility function to check that all lookups data wrapped in `Option`s are `Some` iff /// 
the Stark uses a permutation argument. -fn check_permutation_options, S: Stark, const D: usize>( +fn check_lookup_options, S: Stark, const D: usize>( stark: &S, proof_with_pis: &StarkProofWithPublicInputsTarget, challenges: &StarkProofChallengesTarget, ) -> Result<()> { let options_is_some = [ - proof_with_pis.proof.permutation_zs_cap.is_some(), - proof_with_pis.proof.openings.permutation_zs.is_some(), - proof_with_pis.proof.openings.permutation_zs_next.is_some(), - challenges.permutation_challenge_sets.is_some(), + proof_with_pis.proof.auxiliary_polys_cap.is_some(), + proof_with_pis.proof.openings.auxiliary_polys.is_some(), + proof_with_pis.proof.openings.auxiliary_polys_next.is_some(), + challenges.lookup_challenge_set.is_some(), ]; ensure!( options_is_some .into_iter() - .all(|b| b == stark.uses_permutation_args()), - "Permutation data doesn't match with Stark configuration." + .all(|b| b == stark.uses_lookups()), + "Lookups data doesn't match with Stark configuration." ); Ok(()) } diff --git a/starky/src/stark.rs b/starky/src/stark.rs index aec37c59df..a9f2b2602f 100644 --- a/starky/src/stark.rs +++ b/starky/src/stark.rs @@ -3,6 +3,7 @@ use alloc::vec::Vec; use plonky2::field::extension::{Extendable, FieldExtension}; use plonky2::field::packed::PackedField; +use plonky2::field::types::Field; use plonky2::fri::structure::{ FriBatchInfo, FriBatchInfoTarget, FriInstanceInfo, FriInstanceInfoTarget, FriOracleInfo, FriPolynomialInfo, @@ -10,12 +11,15 @@ use plonky2::fri::structure::{ use plonky2::hash::hash_types::RichField; use plonky2::iop::ext_target::ExtensionTarget; use plonky2::plonk::circuit_builder::CircuitBuilder; -use plonky2::util::ceil_div_usize; use crate::config::StarkConfig; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::evaluation_frame::StarkEvaluationFrame; -use crate::permutation::PermutationPair; +use crate::lookup::Lookup; + +const TRACE_ORACLE_INDEX: usize = 0; +const AUXILIARY_ORACLE_INDEX: usize = 
1; +const QUOTIENT_ORACLE_INDEX: usize = 2; /// Represents a STARK system. pub trait Stark, const D: usize>: Sync { @@ -66,7 +70,7 @@ pub trait Stark, const D: usize>: Sync { /// Evaluate constraints at a vector of points from the degree `D` extension field. This is like /// `eval_ext`, except in the context of a recursive circuit. - /// Note: constraints must be added through`yeld_constr.constraint(builder, constraint)` in the + /// Note: constraints must be added through`yield_constr.constraint(builder, constraint)` in the /// same order as they are given in `eval_packed_generic`. fn eval_ext_circuit( &self, @@ -94,49 +98,47 @@ pub trait Stark, const D: usize>: Sync { g: F, config: &StarkConfig, ) -> FriInstanceInfo { - let mut oracles = vec![]; - - let trace_info = FriPolynomialInfo::from_range(oracles.len(), 0..Self::COLUMNS); - oracles.push(FriOracleInfo { + let trace_oracle = FriOracleInfo { num_polys: Self::COLUMNS, blinding: false, - }); - - let permutation_zs_info = if self.uses_permutation_args() { - let num_z_polys = self.num_permutation_batches(config); - let polys = FriPolynomialInfo::from_range(oracles.len(), 0..num_z_polys); - oracles.push(FriOracleInfo { - num_polys: num_z_polys, - blinding: false, - }); - polys - } else { - vec![] }; + let trace_info = FriPolynomialInfo::from_range(TRACE_ORACLE_INDEX, 0..Self::COLUMNS); + + let num_lookup_columns = self.num_lookup_helper_columns(config); + let num_auxiliary_polys = num_lookup_columns; + let auxiliary_oracle = FriOracleInfo { + num_polys: num_auxiliary_polys, + blinding: false, + }; + let auxiliary_polys_info = + FriPolynomialInfo::from_range(AUXILIARY_ORACLE_INDEX, 0..num_auxiliary_polys); - let num_quotient_polys = self.quotient_degree_factor() * config.num_challenges; - let quotient_info = FriPolynomialInfo::from_range(oracles.len(), 0..num_quotient_polys); - oracles.push(FriOracleInfo { + let num_quotient_polys = self.num_quotient_polys(config); + let quotient_oracle = FriOracleInfo { num_polys: 
num_quotient_polys, blinding: false, - }); + }; + let quotient_info = + FriPolynomialInfo::from_range(QUOTIENT_ORACLE_INDEX, 0..num_quotient_polys); let zeta_batch = FriBatchInfo { point: zeta, polynomials: [ trace_info.clone(), - permutation_zs_info.clone(), + auxiliary_polys_info.clone(), quotient_info, ] .concat(), }; let zeta_next_batch = FriBatchInfo { point: zeta.scalar_mul(g), - polynomials: [trace_info, permutation_zs_info].concat(), + polynomials: [trace_info, auxiliary_polys_info].concat(), }; - let batches = vec![zeta_batch, zeta_next_batch]; - FriInstanceInfo { oracles, batches } + FriInstanceInfo { + oracles: vec![trace_oracle, auxiliary_oracle, quotient_oracle], + batches: vec![zeta_batch, zeta_next_batch], + } } /// Computes the FRI instance used to prove this Stark. @@ -147,38 +149,34 @@ pub trait Stark, const D: usize>: Sync { g: F, config: &StarkConfig, ) -> FriInstanceInfoTarget { - let mut oracles = vec![]; - - let trace_info = FriPolynomialInfo::from_range(oracles.len(), 0..Self::COLUMNS); - oracles.push(FriOracleInfo { + let trace_oracle = FriOracleInfo { num_polys: Self::COLUMNS, blinding: false, - }); - - let permutation_zs_info = if self.uses_permutation_args() { - let num_z_polys = self.num_permutation_batches(config); - let polys = FriPolynomialInfo::from_range(oracles.len(), 0..num_z_polys); - oracles.push(FriOracleInfo { - num_polys: num_z_polys, - blinding: false, - }); - polys - } else { - vec![] }; + let trace_info = FriPolynomialInfo::from_range(TRACE_ORACLE_INDEX, 0..Self::COLUMNS); - let num_quotient_polys = self.quotient_degree_factor() * config.num_challenges; - let quotient_info = FriPolynomialInfo::from_range(oracles.len(), 0..num_quotient_polys); - oracles.push(FriOracleInfo { + let num_lookup_columns = self.num_lookup_helper_columns(config); + let num_auxiliary_polys = num_lookup_columns; + let auxiliary_oracle = FriOracleInfo { + num_polys: num_auxiliary_polys, + blinding: false, + }; + let auxiliary_polys_info = + 
FriPolynomialInfo::from_range(AUXILIARY_ORACLE_INDEX, 0..num_auxiliary_polys); + + let num_quotient_polys = self.num_quotient_polys(config); + let quotient_oracle = FriOracleInfo { num_polys: num_quotient_polys, blinding: false, - }); + }; + let quotient_info = + FriPolynomialInfo::from_range(QUOTIENT_ORACLE_INDEX, 0..num_quotient_polys); let zeta_batch = FriBatchInfoTarget { point: zeta, polynomials: [ trace_info.clone(), - permutation_zs_info.clone(), + auxiliary_polys_info.clone(), quotient_info, ] .concat(), @@ -186,40 +184,28 @@ pub trait Stark, const D: usize>: Sync { let zeta_next = builder.mul_const_extension(g, zeta); let zeta_next_batch = FriBatchInfoTarget { point: zeta_next, - polynomials: [trace_info, permutation_zs_info].concat(), + polynomials: [trace_info, auxiliary_polys_info].concat(), }; - let batches = vec![zeta_batch, zeta_next_batch]; - FriInstanceInfoTarget { oracles, batches } + FriInstanceInfoTarget { + oracles: vec![trace_oracle, auxiliary_oracle, quotient_oracle], + batches: vec![zeta_batch, zeta_next_batch], + } } - /// Pairs of lists of columns that should be permutations of one another. A permutation argument - /// will be used for each such pair. Empty by default. - fn permutation_pairs(&self) -> Vec { + fn lookups(&self) -> Vec> { vec![] } - fn uses_permutation_args(&self) -> bool { - !self.permutation_pairs().is_empty() - } - - /// The number of permutation argument instances that can be combined into a single constraint. - fn permutation_batch_size(&self) -> usize { - // The permutation argument constraints look like - // Z(x) \prod(...) = Z(g x) \prod(...) - // where each product has a number of terms equal to the batch size. So our batch size - // should be one less than our constraint degree, which happens to be our quotient degree. 
- self.quotient_degree_factor() - } - - fn num_permutation_instances(&self, config: &StarkConfig) -> usize { - self.permutation_pairs().len() * config.num_challenges + fn num_lookup_helper_columns(&self, config: &StarkConfig) -> usize { + self.lookups() + .iter() + .map(|lookup| lookup.num_helper_columns(self.constraint_degree())) + .sum::() + * config.num_challenges } - fn num_permutation_batches(&self, config: &StarkConfig) -> usize { - ceil_div_usize( - self.num_permutation_instances(config), - self.permutation_batch_size(), - ) + fn uses_lookups(&self) -> bool { + !self.lookups().is_empty() } } diff --git a/starky/src/vanishing_poly.rs b/starky/src/vanishing_poly.rs index 0a399dce53..6a179fe27a 100644 --- a/starky/src/vanishing_poly.rs +++ b/starky/src/vanishing_poly.rs @@ -3,19 +3,18 @@ use plonky2::field::packed::PackedField; use plonky2::hash::hash_types::RichField; use plonky2::plonk::circuit_builder::CircuitBuilder; -use crate::config::StarkConfig; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::permutation::{ - eval_permutation_checks, eval_permutation_checks_circuit, PermutationCheckDataTarget, - PermutationCheckVars, +use crate::lookup::{ + eval_ext_lookups_circuit, eval_packed_lookups_generic, Lookup, LookupCheckVars, + LookupCheckVarsTarget, }; use crate::stark::Stark; pub(crate) fn eval_vanishing_poly( stark: &S, - config: &StarkConfig, vars: &S::EvaluationFrame, - permutation_data: Option>, + lookups: &[Lookup], + lookup_vars: Option>, consumer: &mut ConstraintConsumer

, ) where F: RichField + Extendable, @@ -24,12 +23,13 @@ pub(crate) fn eval_vanishing_poly( S: Stark, { stark.eval_packed_generic(vars, consumer); - if let Some(permutation_data) = permutation_data { - eval_permutation_checks::( + if let Some(lookup_vars) = lookup_vars { + // Evaluate the STARK constraints related to the permutation arguments. + eval_packed_lookups_generic::( stark, - config, + lookups, vars, - permutation_data, + lookup_vars, consumer, ); } @@ -38,23 +38,16 @@ pub(crate) fn eval_vanishing_poly( pub(crate) fn eval_vanishing_poly_circuit( builder: &mut CircuitBuilder, stark: &S, - config: &StarkConfig, vars: &S::EvaluationFrameTarget, - permutation_data: Option>, + lookup_vars: Option>, consumer: &mut RecursiveConstraintConsumer, ) where F: RichField + Extendable, S: Stark, { stark.eval_ext_circuit(builder, vars, consumer); - if let Some(permutation_data) = permutation_data { - eval_permutation_checks_circuit::( - builder, - stark, - config, - vars, - permutation_data, - consumer, - ); + if let Some(lookup_vars) = lookup_vars { + // Evaluate all of the STARK's constraints related to the permutation argument. 
+ eval_ext_lookups_circuit::(builder, stark, vars, lookup_vars, consumer); } } diff --git a/starky/src/verifier.rs b/starky/src/verifier.rs index 28b9a3e2b3..577405ef4f 100644 --- a/starky/src/verifier.rs +++ b/starky/src/verifier.rs @@ -7,13 +7,14 @@ use plonky2::field::extension::{Extendable, FieldExtension}; use plonky2::field::types::Field; use plonky2::fri::verifier::verify_fri_proof; use plonky2::hash::hash_types::RichField; +use plonky2::hash::merkle_tree::MerkleCap; use plonky2::plonk::config::GenericConfig; use plonky2::plonk::plonk_common::reduce_with_powers; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; use crate::evaluation_frame::StarkEvaluationFrame; -use crate::permutation::PermutationCheckVars; +use crate::lookup::LookupCheckVars; use crate::proof::{StarkOpeningSet, StarkProof, StarkProofChallenges, StarkProofWithPublicInputs}; use crate::stark::Stark; use crate::vanishing_poly::eval_vanishing_poly; @@ -30,7 +31,7 @@ pub fn verify_stark_proof< ) -> Result<()> { ensure!(proof_with_pis.public_inputs.len() == S::PUBLIC_INPUTS); let degree_bits = proof_with_pis.proof.recover_degree_bits(config); - let challenges = proof_with_pis.get_challenges(&stark, config, degree_bits); + let challenges = proof_with_pis.get_challenges(config, degree_bits); verify_stark_proof_with_challenges(stark, proof_with_pis, challenges, degree_bits, config) } @@ -47,7 +48,7 @@ pub(crate) fn verify_stark_proof_with_challenges< config: &StarkConfig, ) -> Result<()> { validate_proof_shape(&stark, &proof_with_pis, config)?; - check_permutation_options(&stark, &proof_with_pis, &challenges)?; + let StarkProofWithPublicInputs { proof, public_inputs, @@ -55,8 +56,8 @@ pub(crate) fn verify_stark_proof_with_challenges< let StarkOpeningSet { local_values, next_values, - permutation_zs, - permutation_zs_next, + auxiliary_polys, + auxiliary_polys_next, quotient_polys, } = &proof.openings; let vars = S::EvaluationFrame::from_values( @@ -81,16 +82,30 @@ 
pub(crate) fn verify_stark_proof_with_challenges< l_0, l_last, ); - let permutation_data = stark.uses_permutation_args().then(|| PermutationCheckVars { - local_zs: permutation_zs.as_ref().unwrap().clone(), - next_zs: permutation_zs_next.as_ref().unwrap().clone(), - permutation_challenge_sets: challenges.permutation_challenge_sets.unwrap(), + + let num_lookup_columns = stark.num_lookup_helper_columns(config); + let lookup_challenges = (num_lookup_columns > 0).then(|| { + challenges + .lookup_challenge_set + .unwrap() + .challenges + .iter() + .map(|ch| ch.beta) + .collect::>() }); + + let lookup_vars = stark.uses_lookups().then(|| LookupCheckVars { + local_values: auxiliary_polys.as_ref().unwrap().clone(), + next_values: auxiliary_polys_next.as_ref().unwrap().clone(), + challenges: lookup_challenges.unwrap(), + }); + let lookups = stark.lookups(); + eval_vanishing_poly::( &stark, - config, &vars, - permutation_data, + &lookups, + lookup_vars, &mut consumer, ); let vanishing_polys_zeta = consumer.accumulators(); @@ -114,7 +129,7 @@ pub(crate) fn verify_stark_proof_with_challenges< } let merkle_caps = once(proof.trace_cap) - .chain(proof.permutation_zs_cap) + .chain(proof.auxiliary_polys_cap) .chain(once(proof.quotient_polys_cap)) .collect_vec(); @@ -152,7 +167,7 @@ where let StarkProof { trace_cap, - permutation_zs_cap, + auxiliary_polys_cap, quotient_polys_cap, openings, // The shape of the opening proof will be checked in the FRI verifier (see @@ -163,8 +178,8 @@ where let StarkOpeningSet { local_values, next_values, - permutation_zs, - permutation_zs_next, + auxiliary_polys, + auxiliary_polys_next, quotient_polys, } = openings; @@ -172,7 +187,8 @@ where let fri_params = config.fri_params(degree_bits); let cap_height = fri_params.config.cap_height; - let num_zs = stark.num_permutation_batches(config); + + let num_auxiliary = stark.num_lookup_helper_columns(config); ensure!(trace_cap.height() == cap_height); ensure!(quotient_polys_cap.height() == cap_height); @@ 
-181,25 +197,13 @@ where ensure!(next_values.len() == S::COLUMNS); ensure!(quotient_polys.len() == stark.num_quotient_polys(config)); - if stark.uses_permutation_args() { - let permutation_zs_cap = permutation_zs_cap - .as_ref() - .ok_or_else(|| anyhow!("Missing Zs cap"))?; - let permutation_zs = permutation_zs - .as_ref() - .ok_or_else(|| anyhow!("Missing permutation_zs"))?; - let permutation_zs_next = permutation_zs_next - .as_ref() - .ok_or_else(|| anyhow!("Missing permutation_zs_next"))?; - - ensure!(permutation_zs_cap.height() == cap_height); - ensure!(permutation_zs.len() == num_zs); - ensure!(permutation_zs_next.len() == num_zs); - } else { - ensure!(permutation_zs_cap.is_none()); - ensure!(permutation_zs.is_none()); - ensure!(permutation_zs_next.is_none()); - } + check_lookup_options::( + stark, + auxiliary_polys_cap, + auxiliary_polys, + auxiliary_polys_next, + config, + )?; Ok(()) } @@ -216,30 +220,43 @@ fn eval_l_0_and_l_last(log_n: usize, x: F) -> (F, F) { (z_x * invs[0], z_x * invs[1]) } -/// Utility function to check that all permutation data wrapped in `Option`s are `Some` iff +/// Utility function to check that all lookups data wrapped in `Option`s are `Some` iff /// the Stark uses a permutation argument. 
-fn check_permutation_options< +fn check_lookup_options< F: RichField + Extendable, C: GenericConfig, S: Stark, const D: usize, >( stark: &S, - proof_with_pis: &StarkProofWithPublicInputs, - challenges: &StarkProofChallenges, + auxiliary_polys_cap: &Option>::Hasher>>, + auxiliary_polys: &Option>::Extension>>, + auxiliary_polys_next: &Option>::Extension>>, + config: &StarkConfig, ) -> Result<()> { - let options_is_some = [ - proof_with_pis.proof.permutation_zs_cap.is_some(), - proof_with_pis.proof.openings.permutation_zs.is_some(), - proof_with_pis.proof.openings.permutation_zs_next.is_some(), - challenges.permutation_challenge_sets.is_some(), - ]; - ensure!( - options_is_some - .into_iter() - .all(|b| b == stark.uses_permutation_args()), - "Permutation data doesn't match with Stark configuration." - ); + if stark.uses_lookups() { + let num_auxiliary = stark.num_lookup_helper_columns(config); + let cap_height = config.fri_config.cap_height; + + let auxiliary_polys_cap = auxiliary_polys_cap + .as_ref() + .ok_or_else(|| anyhow!("Missing auxiliary_polys_cap"))?; + let auxiliary_polys = auxiliary_polys + .as_ref() + .ok_or_else(|| anyhow!("Missing auxiliary_polys"))?; + let auxiliary_polys_next = auxiliary_polys_next + .as_ref() + .ok_or_else(|| anyhow!("Missing auxiliary_polys_next"))?; + + ensure!(auxiliary_polys_cap.height() == cap_height); + ensure!(auxiliary_polys.len() == num_auxiliary); + ensure!(auxiliary_polys_next.len() == num_auxiliary); + } else { + ensure!(auxiliary_polys_cap.is_none()); + ensure!(auxiliary_polys.is_none()); + ensure!(auxiliary_polys_next.is_none()); + } + Ok(()) } diff --git a/util/.cargo/katex-header.html b/util/.cargo/katex-header.html new file mode 100644 index 0000000000..20723b5d27 --- /dev/null +++ b/util/.cargo/katex-header.html @@ -0,0 +1 @@ +../../.cargo/katex-header.html \ No newline at end of file diff --git a/util/Cargo.toml b/util/Cargo.toml index 1ece1e574f..758391c3b9 100644 --- a/util/Cargo.toml +++ b/util/Cargo.toml @@ 
-7,3 +7,7 @@ edition = "2021" [dev-dependencies] rand = { version = "0.8.5", default-features = false, features = ["getrandom"] } + +# Display math equations properly in documentation +[package.metadata.docs.rs] +rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] diff --git a/util/src/lib.rs b/util/src/lib.rs index 6cccd29c2b..613bf6ef5a 100644 --- a/util/src/lib.rs +++ b/util/src/lib.rs @@ -1,6 +1,3 @@ -#![allow(clippy::new_without_default)] -#![allow(clippy::too_many_arguments)] -#![allow(clippy::type_complexity)] #![allow(clippy::needless_range_loop)] #![no_std] @@ -15,7 +12,7 @@ use crate::transpose_util::transpose_in_place_square; mod transpose_util; -pub fn bits_u64(n: u64) -> usize { +pub const fn bits_u64(n: u64) -> usize { (64 - n.leading_zeros()) as usize } @@ -25,7 +22,7 @@ pub const fn ceil_div_usize(a: usize, b: usize) -> usize { /// Computes `ceil(log_2(n))`. #[must_use] -pub fn log2_ceil(n: usize) -> usize { +pub const fn log2_ceil(n: usize) -> usize { (usize::BITS - n.saturating_sub(1).leading_zeros()) as usize } @@ -113,9 +110,12 @@ fn reverse_index_bits_large(arr: &[T], n_power: usize) -> Vec { unsafe fn reverse_index_bits_in_place_small(arr: &mut [T], lb_n: usize) { if lb_n <= 6 { // BIT_REVERSE_6BIT holds 6-bit reverses. This shift makes them lb_n-bit reverses. - let dst_shr_amt = 6 - lb_n; + let dst_shr_amt = 6 - lb_n as u32; for src in 0..arr.len() { - let dst = (BIT_REVERSE_6BIT[src] as usize) >> dst_shr_amt; + // `wrapping_shr` handles the case when `arr.len() == 1`. In that case `src == 0`, so + // `src.reverse_bits() == 0`. `usize::wrapping_shr` by 64 is a no-op, but it gives the + // correct result. 
+ let dst = (BIT_REVERSE_6BIT[src] as usize).wrapping_shr(dst_shr_amt); if src < dst { swap(arr.get_unchecked_mut(src), arr.get_unchecked_mut(dst)); } @@ -124,11 +124,14 @@ unsafe fn reverse_index_bits_in_place_small(arr: &mut [T], lb_n: usize) { // LLVM does not know that it does not need to reverse src at each iteration (which is // expensive on x86). We take advantage of the fact that the low bits of dst change rarely and the high // bits of dst are dependent only on the low bits of src. - let dst_lo_shr_amt = 64 - (lb_n - 6); + let dst_lo_shr_amt = usize::BITS - (lb_n - 6) as u32; let dst_hi_shl_amt = lb_n - 6; for src_chunk in 0..(arr.len() >> 6) { let src_hi = src_chunk << 6; - let dst_lo = src_chunk.reverse_bits() >> dst_lo_shr_amt; + // `wrapping_shr` handles the case when `arr.len() == 1`. In that case `src == 0`, so + // `src.reverse_bits() == 0`. `usize::wrapping_shr` by 64 is a no-op, but it gives the + // correct result. + let dst_lo = src_chunk.reverse_bits().wrapping_shr(dst_lo_shr_amt); for src_lo in 0..(1 << 6) { let dst_hi = (BIT_REVERSE_6BIT[src_lo] as usize) << dst_hi_shl_amt; let src = src_hi + src_lo;