diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000..dc69733e1b
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,83 @@
+# Google C/C++ Code Style settings
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+# Author: Kehan Xue, kehan.xue (at) gmail.com
+
+Language: Cpp
+BasedOnStyle: Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: None
+AlignOperands: Align
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never  # To avoid conflicts, set this to "Never"; every "if" statement should use braces
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterStruct: false
+  AfterControlStatement: Never
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  BeforeLambdaBody: false
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+ColumnLimit: 80
+CompactNamespaces: false
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false  # Make sure the * or & is aligned to the left
+EmptyLineBeforeAccessModifier: LogicalBlock
+FixNamespaceComments: true
+IncludeBlocks: Preserve
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 2
+KeepEmptyLinesAtTheStartOfBlocks: true
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PointerAlignment: Left
+ReflowComments: false
+# SeparateDefinitionBlocks: Always   # Only supported since clang-format 14
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInContainerLiterals: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: c++11
+TabWidth: 4
+UseTab: Never
\ No newline at end of file
diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index d928b91df6..a6fbda6190 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -101,6 +101,16 @@ runs:
         else
           echo "pkg-config and libssl-dev are already installed."
         fi
-      
+
+    - name: Echo docker buildx version
+      shell: bash
+      run: docker buildx version
+
     - name: Set up Docker
+      uses: crazy-max/ghaction-setup-docker@v3
+
+    - name: Set up Docker Buildx
       uses: docker/setup-buildx-action@v3
+      with:
+        driver-opts: |
+          image=public.ecr.aws/vend/moby/buildkit:buildx-stable-1
\ No newline at end of file
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 0b63d294a4..7f494bae74 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -41,7 +41,7 @@ jobs:
         run: |
           cargo run -p sp1-cli -- prove install-toolchain
           cd crates/cli
-          cargo install --force --locked --path .
+          cargo install --locked --force --path .
           cargo clean
 
       - name: Run cargo check
@@ -85,7 +85,7 @@ jobs:
         run: |
           cargo run -p sp1-cli -- prove install-toolchain
           cd crates/cli
-          cargo install --force --locked --path .
+          cargo install --locked --force --path .
           cargo clean
 
       - name: Run cargo check
@@ -122,7 +122,7 @@ jobs:
         run: |
           cargo run -p sp1-cli -- prove install-toolchain
           cd crates/cli
-          cargo install --force --locked --path .
+          cargo install --locked --force --path .
           cargo clean
 
       - name: Run cargo fmt
@@ -165,7 +165,7 @@ jobs:
         run: |
           cargo run -p sp1-cli -- prove install-toolchain
           cd crates/cli
-          cargo install --force --locked --path .
+          cargo install --locked --force --path .
           cargo clean
 
       - name: Check workspace no features
@@ -209,7 +209,7 @@ jobs:
         run: |
           cargo run -p sp1-cli -- prove install-toolchain
           cd crates/cli
-          cargo install --force --locked --path .
+          cargo install --locked --force --path .
           cargo clean
 
       - name: Run cargo fmt
@@ -243,7 +243,7 @@ jobs:
         run: |
           cargo run -p sp1-cli -- prove install-toolchain
           cd crates/cli
-          cargo install --force --locked --path .
+          cargo install --locked --force --path .
           cargo clean
 
       - name: Run cargo prove new
@@ -277,7 +277,7 @@ jobs:
         run: |
           cargo run -p sp1-cli -- prove install-toolchain
           cd crates/cli
-          cargo install --force --locked --path .
+          cargo install --locked --force --path .
           cargo clean
 
       - name: Run Evaluation
@@ -324,7 +324,7 @@ jobs:
         run: |
           cargo run -p sp1-cli -- prove install-toolchain
           cd crates/cli
-          cargo install --force --locked --path .
+          cargo install --locked --force --path .
           cargo clean
 
       - name: Run tendermint script
@@ -369,7 +369,7 @@ jobs:
 
   #     - name: "Install cargo-prove"
   #       run: |
-  #         cargo install --force --locked --path ./crates/cli
+  #         cargo install --locked --force --path ./crates/cli
 
   #     - name: "Install SP1 toolchain"
   #       run: |
diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml
index f54af577f5..3841ae8adb 100644
--- a/.github/workflows/release-plz.yml
+++ b/.github/workflows/release-plz.yml
@@ -8,6 +8,7 @@ on:
   push:
     branches:
       - main
+  workflow_dispatch:
 
 jobs:
   release-plz:
diff --git a/.github/workflows/suite.yml b/.github/workflows/suite.yml
index 2a2b44a0d3..f4b2826390 100644
--- a/.github/workflows/suite.yml
+++ b/.github/workflows/suite.yml
@@ -59,6 +59,13 @@ jobs:
           command: clean
           toolchain: 1.81.0
 
+      - name: Install SP1 toolchain from repo
+        run: |
+          cargo run -p sp1-cli -- prove install-toolchain
+          cd crates/cli
+          cargo install --locked --force --path .
+          cargo clean
+
       - name: Run sp1-perf
         uses: actions-rs/cargo@v1
         with:
@@ -67,6 +74,7 @@ jobs:
           args: --release -p sp1-perf -- --program workdir/program.bin --stdin workdir/stdin.bin --mode cpu
         env:
           RUST_LOG: info
+          VERIFY_VK: false
           RUSTFLAGS: -Copt-level=3 -Ctarget-cpu=native
           RUST_BACKTRACE: 1
 
@@ -112,6 +120,13 @@ jobs:
           command: clean
           toolchain: 1.81.0
 
+      - name: Install SP1 toolchain from repo
+        run: |
+          cargo run -p sp1-cli -- prove install-toolchain
+          cd crates/cli
+          cargo install --locked --force --path .
+          cargo clean
+
       - name: Run sp1-perf
         uses: actions-rs/cargo@v1
         with:
@@ -120,6 +135,7 @@ jobs:
           args: --release -p sp1-perf -- --program workdir/program.bin --stdin workdir/stdin.bin --mode cuda
         env:
           RUST_LOG: debug
+          VERIFY_VK: false
           RUSTFLAGS: -Copt-level=3 -Ctarget-cpu=native
           RUST_BACKTRACE: 1
           SP1_PROVER: cuda
@@ -164,6 +180,13 @@ jobs:
           command: clean
           toolchain: 1.81.0
 
+      - name: Install SP1 toolchain from repo
+        run: |
+          cargo run -p sp1-cli -- prove install-toolchain
+          cd crates/cli
+          cargo install --locked --force --path .
+          cargo clean
+
       - name: Run sp1-perf
         uses: actions-rs/cargo@v1
         with:
@@ -172,6 +195,7 @@ jobs:
           args: --release -p sp1-perf --features "native-gnark,network-v2" -- --program workdir/program.bin --stdin workdir/stdin.bin --mode network
         env:
           RUST_LOG: info
+          VERIFY_VK: false
           RUSTFLAGS: -Copt-level=3 -Ctarget-cpu=native
           RUST_BACKTRACE: 1
           SP1_PROVER: network
diff --git a/.gitignore b/.gitignore
index a03b205b74..bb0a2397be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,4 +40,7 @@ crates/prover/trusted-setup
 crates/prover/vk
 
 # Example legacy elf
-examples/elf 
\ No newline at end of file
+examples/elf 
+# Example fibonacci groth16 / plonk proofs
+examples/fibonacci/fibonacci-groth16.bin
+examples/fibonacci/fibonacci-plonk.bin
diff --git a/.vscode/settings.json b/.vscode/settings.json
index d1500a8b1a..f3522916a0 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -66,4 +66,22 @@
     // "rust-analyzer.check.workspace": false,
     // "rust-analyzer.check.invocationStrategy": "once",
     // "rust-analyzer.cargo.buildScripts.invocationStrategy": "once",
+    "C_Cpp.default.includePath": [
+        "${workspaceFolder}/crates/**/include",
+        "${workspaceFolder}/target/include",
+    ],
+    "C_Cpp.intelliSenseEngine": "Tag Parser",
+    "files.associations": {
+        "random": "cpp",
+        "chrono": "cpp",
+        "cstdint": "cpp",
+        "ratio": "cpp",
+        "system_error": "cpp",
+        "array": "cpp",
+        "functional": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "cmath": "cpp"
+    },
 }
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index 21b1074ae0..fdf7261393 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -105,9 +105,9 @@ dependencies = [
 
 [[package]]
 name = "allocator-api2"
-version = "0.2.20"
+version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
 
 [[package]]
 name = "alloy-consensus"
@@ -168,9 +168,9 @@ dependencies = [
 
 [[package]]
 name = "alloy-json-abi"
-version = "0.8.12"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b84c506bf264110fa7e90d9924f742f40ef53c6572ea56a0b0bd714a567ed389"
+checksum = "ac4b22b3e51cac09fd2adfcc73b55f447b4df669f983c13f7894ec82b607c63f"
 dependencies = [
  "alloy-primitives",
  "alloy-sol-type-parser",
@@ -228,19 +228,19 @@ dependencies = [
 
 [[package]]
 name = "alloy-primitives"
-version = "0.8.12"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fce5dbd6a4f118eecc4719eaa9c7ffc31c315e6c5ccde3642db927802312425"
+checksum = "9db948902dfbae96a73c2fbf1f7abec62af034ab883e4c777c3fd29702bd6e2c"
 dependencies = [
  "alloy-rlp",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "cfg-if",
  "const-hex",
  "derive_more",
  "foldhash",
- "hashbrown 0.15.1",
+ "hashbrown 0.15.2",
  "hex-literal",
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
  "itoa",
  "k256",
  "keccak-asm",
@@ -248,7 +248,7 @@ dependencies = [
  "proptest",
  "rand 0.8.5",
  "ruint",
- "rustc-hash 2.0.0",
+ "rustc-hash 2.1.0",
  "serde",
  "sha3",
  "tiny-keccak",
@@ -262,7 +262,7 @@ checksum = "da0822426598f95e45dd1ea32a738dac057529a709ee645fcc516ffa4cbde08f"
 dependencies = [
  "alloy-rlp-derive",
  "arrayvec",
- "bytes 1.8.0",
+ "bytes 1.9.0",
 ]
 
 [[package]]
@@ -273,7 +273,7 @@ checksum = "2b09cae092c27b6f1bde952653a22708691802e57bfef4a2973b80bea21efd3f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -338,56 +338,56 @@ dependencies = [
 
 [[package]]
 name = "alloy-sol-macro"
-version = "0.8.12"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9343289b4a7461ed8bab8618504c995c049c082b70c7332efd7b32125633dc05"
+checksum = "3bfd7853b65a2b4f49629ec975fee274faf6dff15ab8894c620943398ef283c0"
 dependencies = [
  "alloy-sol-macro-expander",
  "alloy-sol-macro-input",
  "proc-macro-error2",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "alloy-sol-macro-expander"
-version = "0.8.12"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4222d70bec485ceccc5d8fd4f2909edd65b5d5e43d4aca0b5dcee65d519ae98f"
+checksum = "82ec42f342d9a9261699f8078e57a7a4fda8aaa73c1a212ed3987080e6a9cd13"
 dependencies = [
  "alloy-sol-macro-input",
  "const-hex",
- "heck",
- "indexmap 2.6.0",
+ "heck 0.5.0",
+ "indexmap 2.7.0",
  "proc-macro-error2",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "syn-solidity",
  "tiny-keccak",
 ]
 
 [[package]]
 name = "alloy-sol-macro-input"
-version = "0.8.12"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e17f2677369571b976e51ea1430eb41c3690d344fef567b840bfc0b01b6f83a"
+checksum = "ed2c50e6a62ee2b4f7ab3c6d0366e5770a21cad426e109c2f40335a1b3aff3df"
 dependencies = [
  "const-hex",
  "dunce",
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "syn-solidity",
 ]
 
 [[package]]
 name = "alloy-sol-type-parser"
-version = "0.8.12"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa64d80ae58ffaafdff9d5d84f58d03775f66c84433916dc9a64ed16af5755da"
+checksum = "ac17c6e89a50fb4a758012e4b409d9a0ba575228e69b539fe37d7a1bd507ca4a"
 dependencies = [
  "serde",
  "winnow 0.6.20",
@@ -395,9 +395,9 @@ dependencies = [
 
 [[package]]
 name = "alloy-sol-types"
-version = "0.8.12"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6520d427d4a8eb7aa803d852d7a52ceb0c519e784c292f64bb339e636918cf27"
+checksum = "c9dc0fffe397aa17628160e16b89f704098bf3c9d74d5d369ebc239575936de5"
 dependencies = [
  "alloy-json-abi",
  "alloy-primitives",
@@ -487,9 +487,9 @@ dependencies = [
 
 [[package]]
 name = "anyhow"
-version = "1.0.93"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775"
+checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
 dependencies = [
  "backtrace",
 ]
@@ -883,7 +883,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -913,7 +913,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -943,7 +943,7 @@ checksum = "3c87f3f15e7794432337fc718554eaa4dc8f04c9677a950ffe366f20a162ae42"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -954,15 +954,15 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
 
 [[package]]
 name = "axum"
-version = "0.7.7"
+version = "0.7.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "504e3947307ac8326a5437504c517c4b56716c9d98fac0028c2acc7ca47d70ae"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
 dependencies = [
  "async-trait",
  "axum-core",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "futures-util",
- "http 1.1.0",
+ "http 1.2.0",
  "http-body",
  "http-body-util",
  "hyper",
@@ -978,7 +978,7 @@ dependencies = [
  "serde_json",
  "serde_path_to_error",
  "serde_urlencoded",
- "sync_wrapper 1.0.1",
+ "sync_wrapper 1.0.2",
  "tokio",
  "tower 0.5.1",
  "tower-layer",
@@ -993,15 +993,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
 dependencies = [
  "async-trait",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "futures-util",
- "http 1.1.0",
+ "http 1.2.0",
  "http-body",
  "http-body-util",
  "mime",
  "pin-project-lite",
  "rustversion",
- "sync_wrapper 1.0.1",
+ "sync_wrapper 1.0.2",
  "tower-layer",
  "tower-service",
  "tracing",
@@ -1099,7 +1099,7 @@ dependencies = [
  "regex",
  "rustc-hash 1.1.0",
  "shlex",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1225,9 +1225,9 @@ checksum = "c3ac9f8b63eca6fd385229b3675f6cc0dc5c8a5c8a54a59d4f52ffd670d87b0c"
 
 [[package]]
 name = "bytemuck"
-version = "1.19.0"
+version = "1.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d"
+checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a"
 dependencies = [
  "bytemuck_derive",
 ]
@@ -1240,7 +1240,7 @@ checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1257,9 +1257,9 @@ checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38"
 
 [[package]]
 name = "bytes"
-version = "1.8.0"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da"
+checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
 dependencies = [
  "serde",
 ]
@@ -1290,9 +1290,9 @@ dependencies = [
 
 [[package]]
 name = "cargo-platform"
-version = "0.1.8"
+version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc"
+checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea"
 dependencies = [
  "serde",
 ]
@@ -1317,11 +1317,30 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 
+[[package]]
+name = "cbindgen"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fce8dd7fcfcbf3a0a87d8f515194b49d6135acab73e18bd380d1d93bb1a15eb"
+dependencies = [
+ "clap",
+ "heck 0.4.1",
+ "indexmap 2.7.0",
+ "log",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn 2.0.90",
+ "tempfile",
+ "toml",
+]
+
 [[package]]
 name = "cc"
-version = "1.2.0"
+version = "1.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1aeb932158bd710538c73702db6945cb68a8fb08c519e6e12706b94263b36db8"
+checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc"
 dependencies = [
  "jobserver",
  "libc",
@@ -1410,9 +1429,9 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.5.20"
+version = "4.5.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
+checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -1420,9 +1439,9 @@ dependencies = [
 
 [[package]]
 name = "clap_builder"
-version = "4.5.20"
+version = "4.5.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
+checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1"
 dependencies = [
  "anstream",
  "anstyle",
@@ -1436,17 +1455,17 @@ version = "4.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "clap_lex"
-version = "0.7.2"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
+checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"
 
 [[package]]
 name = "colorchoice"
@@ -1478,9 +1497,9 @@ dependencies = [
 
 [[package]]
 name = "const-hex"
-version = "1.13.1"
+version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0121754e84117e65f9d90648ee6aa4882a6e63110307ab73967a4c5e7e69e586"
+checksum = "4b0485bab839b018a8f1723fc5391819fea5f8f0f32288ef8a735fd096b6160c"
 dependencies = [
  "cfg-if",
  "cpufeatures",
@@ -1534,6 +1553,16 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "core-foundation"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
 [[package]]
 name = "core-foundation-sys"
 version = "0.8.7"
@@ -1542,9 +1571,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.15"
+version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ca741a962e1b0bff6d724a1a0958b686406e853bb14061f218562e1896f95e6"
+checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3"
 dependencies = [
  "libc",
 ]
@@ -1734,32 +1763,6 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
-[[package]]
-name = "curve25519-dalek"
-version = "4.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
-dependencies = [
- "cfg-if",
- "cpufeatures",
- "curve25519-dalek-derive",
- "fiat-crypto",
- "rustc_version 0.4.1",
- "subtle",
- "zeroize",
-]
-
-[[package]]
-name = "curve25519-dalek-derive"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.87",
-]
-
 [[package]]
 name = "darling"
 version = "0.13.4"
@@ -1921,7 +1924,7 @@ checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "unicode-xid",
 ]
 
@@ -2002,7 +2005,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2102,7 +2105,7 @@ checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2113,12 +2116,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 
 [[package]]
 name = "errno"
-version = "0.3.9"
+version = "0.3.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
+checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
 dependencies = [
  "libc",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -2140,9 +2143,9 @@ dependencies = [
 
 [[package]]
 name = "event-listener-strategy"
-version = "0.5.2"
+version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1"
+checksum = "3c3e4e0dd3673c1139bf041f3008816d9cf2946bbfac2945c09e523b8d7b05b2"
 dependencies = [
  "event-listener 5.3.1",
  "pin-project-lite",
@@ -2181,7 +2184,7 @@ checksum = "139834ddba373bbdd213dffe02c8d110508dcf1726c2be27e8d1f7d7e1856418"
 dependencies = [
  "arrayvec",
  "auto_impl",
- "bytes 1.8.0",
+ "bytes 1.9.0",
 ]
 
 [[package]]
@@ -2224,12 +2227,6 @@ dependencies = [
  "syn 1.0.109",
 ]
 
-[[package]]
-name = "fiat-crypto"
-version = "0.2.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
-
 [[package]]
 name = "fixed-hash"
 version = "0.8.0"
@@ -2385,7 +2382,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2443,9 +2440,9 @@ dependencies = [
 
 [[package]]
 name = "generic-array"
-version = "1.1.0"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96512db27971c2c3eece70a1e106fbe6c87760234e31e8f7e5634912fe52794a"
+checksum = "2cb8bc4c28d15ade99c7e90b219f30da4be5c88e586277e8cbe886beeb868ab2"
 dependencies = [
  "serde",
  "typenum",
@@ -2469,8 +2466,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
 dependencies = [
  "cfg-if",
+ "js-sys",
  "libc",
  "wasi 0.11.0+wasi-snapshot-preview1",
+ "wasm-bindgen",
 ]
 
 [[package]]
@@ -2566,17 +2565,17 @@ dependencies = [
 
 [[package]]
 name = "h2"
-version = "0.4.6"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205"
+checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e"
 dependencies = [
  "atomic-waker",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "fnv",
  "futures-core",
  "futures-sink",
- "http 1.1.0",
- "indexmap 2.6.0",
+ "http 1.2.0",
+ "indexmap 2.7.0",
  "slab",
  "tokio",
  "tokio-util",
@@ -2644,9 +2643,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.15.1"
+version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
 dependencies = [
  "allocator-api2",
  "equivalent",
@@ -2654,6 +2653,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -2722,18 +2727,18 @@ version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "fnv",
  "itoa",
 ]
 
 [[package]]
 name = "http"
-version = "1.1.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
+checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "fnv",
  "itoa",
 ]
@@ -2744,8 +2749,8 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
 dependencies = [
- "bytes 1.8.0",
- "http 1.1.0",
+ "bytes 1.9.0",
+ "http 1.2.0",
 ]
 
 [[package]]
@@ -2754,9 +2759,9 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "futures-util",
- "http 1.1.0",
+ "http 1.2.0",
  "http-body",
  "pin-project-lite",
 ]
@@ -2811,15 +2816,15 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
 
 [[package]]
 name = "hyper"
-version = "1.5.0"
+version = "1.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a"
+checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "futures-channel",
  "futures-util",
  "h2",
- "http 1.1.0",
+ "http 1.2.0",
  "http-body",
  "httparse",
  "httpdate",
@@ -2837,15 +2842,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333"
 dependencies = [
  "futures-util",
- "http 1.1.0",
+ "http 1.2.0",
  "hyper",
  "hyper-util",
- "rustls 0.23.16",
+ "rustls 0.23.19",
  "rustls-pki-types",
  "tokio",
  "tokio-rustls",
  "tower-service",
- "webpki-roots 0.26.6",
+ "webpki-roots 0.26.7",
 ]
 
 [[package]]
@@ -2867,7 +2872,7 @@ version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "http-body-util",
  "hyper",
  "hyper-util",
@@ -2883,10 +2888,10 @@ version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "futures-channel",
  "futures-util",
- "http 1.1.0",
+ "http 1.2.0",
  "http-body",
  "hyper",
  "pin-project-lite",
@@ -3034,7 +3039,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -3075,13 +3080,13 @@ dependencies = [
 
 [[package]]
 name = "impl-trait-for-tuples"
-version = "0.2.2"
+version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11d7a9f6330b71fea57921c9b61c47ee6e84f72d394754eff6163ae67e7395eb"
+checksum = "a0eb5a3343abf848c0984fe4604b2b105da9539376e24fc0a3b0007411ae4fd9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -3102,12 +3107,12 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.6.0"
+version = "2.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
+checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
 dependencies = [
  "equivalent",
- "hashbrown 0.15.1",
+ "hashbrown 0.15.2",
  "serde",
 ]
 
@@ -3214,9 +3219,9 @@ dependencies = [
 
 [[package]]
 name = "itoa"
-version = "1.0.11"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
+checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
 
 [[package]]
 name = "jobserver"
@@ -3229,10 +3234,11 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.72"
+version = "0.3.74"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9"
+checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705"
 dependencies = [
+ "once_cell",
  "wasm-bindgen",
 ]
 
@@ -3303,9 +3309,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.162"
+version = "0.2.167"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398"
+checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
 
 [[package]]
 name = "libgit2-sys"
@@ -3321,9 +3327,9 @@ dependencies = [
 
 [[package]]
 name = "libloading"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
+checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
 dependencies = [
  "cfg-if",
  "windows-targets 0.52.6",
@@ -3375,9 +3381,9 @@ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
 
 [[package]]
 name = "litemap"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
+checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
 
 [[package]]
 name = "lock_api"
@@ -3404,7 +3410,7 @@ version = "0.12.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
 dependencies = [
- "hashbrown 0.15.1",
+ "hashbrown 0.15.2",
 ]
 
 [[package]]
@@ -3467,11 +3473,10 @@ dependencies = [
 
 [[package]]
 name = "mio"
-version = "1.0.2"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec"
+checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
 dependencies = [
- "hermit-abi 0.3.9",
  "libc",
  "wasi 0.11.0+wasi-snapshot-preview1",
  "windows-sys 0.52.0",
@@ -3495,7 +3500,7 @@ dependencies = [
  "openssl-probe",
  "openssl-sys",
  "schannel",
- "security-framework",
+ "security-framework 2.11.1",
  "security-framework-sys",
  "tempfile",
 ]
@@ -3751,7 +3756,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -3799,7 +3804,7 @@ dependencies = [
 [[package]]
 name = "p3-air"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -3808,7 +3813,7 @@ dependencies = [
 [[package]]
 name = "p3-baby-bear"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "num-bigint 0.4.6",
  "p3-field",
@@ -3822,7 +3827,7 @@ dependencies = [
 [[package]]
 name = "p3-bn254-fr"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "ff 0.13.0",
  "num-bigint 0.4.6",
@@ -3836,7 +3841,7 @@ dependencies = [
 [[package]]
 name = "p3-challenger"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-maybe-rayon",
@@ -3849,7 +3854,7 @@ dependencies = [
 [[package]]
 name = "p3-commit"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-challenger",
@@ -3862,7 +3867,7 @@ dependencies = [
 [[package]]
 name = "p3-dft"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -3874,7 +3879,7 @@ dependencies = [
 [[package]]
 name = "p3-field"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "num-bigint 0.4.6",
@@ -3887,7 +3892,7 @@ dependencies = [
 [[package]]
 name = "p3-fri"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-challenger",
@@ -3905,7 +3910,7 @@ dependencies = [
 [[package]]
 name = "p3-interpolation"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -3915,7 +3920,7 @@ dependencies = [
 [[package]]
 name = "p3-keccak-air"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-air",
  "p3-field",
@@ -3928,7 +3933,7 @@ dependencies = [
 [[package]]
 name = "p3-matrix"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-field",
@@ -3942,7 +3947,7 @@ dependencies = [
 [[package]]
 name = "p3-maybe-rayon"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "rayon",
 ]
@@ -3950,7 +3955,7 @@ dependencies = [
 [[package]]
 name = "p3-mds"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-dft",
@@ -3964,7 +3969,7 @@ dependencies = [
 [[package]]
 name = "p3-merkle-tree"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-commit",
@@ -3980,7 +3985,7 @@ dependencies = [
 [[package]]
 name = "p3-poseidon2"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "gcd",
  "p3-field",
@@ -3993,7 +3998,7 @@ dependencies = [
 [[package]]
 name = "p3-symmetric"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-field",
@@ -4003,7 +4008,7 @@ dependencies = [
 [[package]]
 name = "p3-uni-stark"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-air",
@@ -4021,7 +4026,7 @@ dependencies = [
 [[package]]
 name = "p3-util"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#d33eaa69b1ef96ad678ebd96ae8e75aef3508b2a"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "serde",
 ]
@@ -4126,6 +4131,12 @@ version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
 
+[[package]]
+name = "pathdiff"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
+
 [[package]]
 name = "pem-rfc7468"
 version = "0.7.0"
@@ -4159,7 +4170,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
 dependencies = [
  "fixedbitset",
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
 ]
 
 [[package]]
@@ -4179,7 +4190,7 @@ checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4283,9 +4294,9 @@ dependencies = [
 
 [[package]]
 name = "portable-atomic"
-version = "1.9.0"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2"
+checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
 
 [[package]]
 name = "powerfmt"
@@ -4309,7 +4320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033"
 dependencies = [
  "proc-macro2",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4384,7 +4395,7 @@ dependencies = [
  "proc-macro-error-attr2",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4395,9 +4406,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.89"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
 dependencies = [
  "unicode-ident",
 ]
@@ -4428,7 +4439,7 @@ version = "0.13.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "prost-derive",
 ]
 
@@ -4438,8 +4449,8 @@ version = "0.13.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
- "bytes 1.8.0",
- "heck",
+ "bytes 1.9.0",
+ "heck 0.5.0",
  "itertools 0.13.0",
  "log",
  "multimap",
@@ -4449,7 +4460,7 @@ dependencies = [
  "prost",
  "prost-types",
  "regex",
- "syn 2.0.87",
+ "syn 2.0.90",
  "tempfile",
 ]
 
@@ -4463,7 +4474,7 @@ dependencies = [
  "itertools 0.13.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4483,37 +4494,40 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
 
 [[package]]
 name = "quinn"
-version = "0.11.5"
+version = "0.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684"
+checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "pin-project-lite",
  "quinn-proto",
  "quinn-udp",
- "rustc-hash 2.0.0",
- "rustls 0.23.16",
+ "rustc-hash 2.1.0",
+ "rustls 0.23.19",
  "socket2",
- "thiserror 1.0.69",
+ "thiserror 2.0.4",
  "tokio",
  "tracing",
 ]
 
 [[package]]
 name = "quinn-proto"
-version = "0.11.8"
+version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6"
+checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
+ "getrandom 0.2.15",
  "rand 0.8.5",
  "ring 0.17.8",
- "rustc-hash 2.0.0",
- "rustls 0.23.16",
+ "rustc-hash 2.1.0",
+ "rustls 0.23.19",
+ "rustls-pki-types",
  "slab",
- "thiserror 1.0.69",
+ "thiserror 2.0.4",
  "tinyvec",
  "tracing",
+ "web-time",
 ]
 
 [[package]]
@@ -4726,12 +4740,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f"
 dependencies = [
  "base64 0.22.1",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "encoding_rs",
  "futures-core",
  "futures-util",
  "h2",
- "http 1.1.0",
+ "http 1.2.0",
  "http-body",
  "http-body-util",
  "hyper",
@@ -4747,13 +4761,13 @@ dependencies = [
  "percent-encoding",
  "pin-project-lite",
  "quinn",
- "rustls 0.23.16",
+ "rustls 0.23.19",
  "rustls-pemfile",
  "rustls-pki-types",
  "serde",
  "serde_json",
  "serde_urlencoded",
- "sync_wrapper 1.0.1",
+ "sync_wrapper 1.0.2",
  "system-configuration",
  "tokio",
  "tokio-native-tls",
@@ -4765,7 +4779,7 @@ dependencies = [
  "wasm-bindgen-futures",
  "wasm-streams",
  "web-sys",
- "webpki-roots 0.26.6",
+ "webpki-roots 0.26.7",
  "windows-registry",
 ]
 
@@ -4777,7 +4791,7 @@ checksum = "562ceb5a604d3f7c885a792d42c199fd8af239d0a51b2fa6a78aafa092452b04"
 dependencies = [
  "anyhow",
  "async-trait",
- "http 1.1.0",
+ "http 1.2.0",
  "reqwest",
  "serde",
  "thiserror 1.0.69",
@@ -4830,7 +4844,7 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bb919243f34364b6bd2fc10ef797edbfa75f33c252e7998527479c6d6b47e1ec"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "rustc-hex",
 ]
 
@@ -4866,7 +4880,7 @@ dependencies = [
  "alloy-rlp",
  "ark-ff 0.3.0",
  "ark-ff 0.4.2",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "fastrlp",
  "num-bigint 0.4.6",
  "num-traits",
@@ -4901,9 +4915,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
 
 [[package]]
 name = "rustc-hash"
-version = "2.0.0"
+version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152"
+checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497"
 
 [[package]]
 name = "rustc-hex"
@@ -4940,9 +4954,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.40"
+version = "0.38.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99e4ea3e1cdc4b559b8e5650f9c8e5998e3e5c1343b4eaf034565f32318d63c0"
+checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6"
 dependencies = [
  "bitflags",
  "errno",
@@ -4966,9 +4980,9 @@ dependencies = [
 
 [[package]]
 name = "rustls"
-version = "0.23.16"
+version = "0.23.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e"
+checksum = "934b404430bb06b3fae2cba809eb45a1ab1aecd64491213d7c3301b88393f8d1"
 dependencies = [
  "log",
  "once_cell",
@@ -4981,15 +4995,14 @@ dependencies = [
 
 [[package]]
 name = "rustls-native-certs"
-version = "0.8.0"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a"
+checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3"
 dependencies = [
  "openssl-probe",
- "rustls-pemfile",
  "rustls-pki-types",
  "schannel",
- "security-framework",
+ "security-framework 3.0.1",
 ]
 
 [[package]]
@@ -5006,6 +5019,9 @@ name = "rustls-pki-types"
 version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
+dependencies = [
+ "web-time",
+]
 
 [[package]]
 name = "rustls-webpki"
@@ -5053,9 +5069,9 @@ dependencies = [
 
 [[package]]
 name = "scale-info"
-version = "2.11.5"
+version = "2.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1aa7ffc1c0ef49b0452c6e2986abf2b07743320641ffd5fc63d552458e3b779b"
+checksum = "346a3b32eba2640d17a9cb5927056b08f3de90f65b72fe09402c2ad07d684d0b"
 dependencies = [
  "cfg-if",
  "derive_more",
@@ -5065,30 +5081,30 @@ dependencies = [
 
 [[package]]
 name = "scale-info-derive"
-version = "2.11.5"
+version = "2.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46385cc24172cf615450267463f937c10072516359b3ff1cb24228a4a08bf951"
+checksum = "c6630024bf739e2179b91fb424b28898baf819414262c5d376677dbff1fe7ebf"
 dependencies = [
  "proc-macro-crate 3.2.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "scc"
-version = "2.2.4"
+version = "2.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d8d25269dd3a12467afe2e510f69fb0b46b698e5afb296b59f2145259deaf8e8"
+checksum = "66b202022bb57c049555430e11fc22fea12909276a80a4c3d368da36ac1d88ed"
 dependencies = [
  "sdd",
 ]
 
 [[package]]
 name = "schannel"
-version = "0.1.26"
+version = "0.1.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1"
+checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d"
 dependencies = [
  "windows-sys 0.59.0",
 ]
@@ -5116,7 +5132,7 @@ checksum = "7f81c2fde025af7e69b1d1420531c8a8811ca898919db177141a85313b1cb932"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -5156,7 +5172,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
 dependencies = [
  "bitflags",
- "core-foundation",
+ "core-foundation 0.9.4",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework"
+version = "3.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1415a607e92bec364ea2cf9264646dcce0f91e6d65281bd6f2819cca3bf39c8"
+dependencies = [
+ "bitflags",
+ "core-foundation 0.10.0",
  "core-foundation-sys",
  "libc",
  "security-framework-sys",
@@ -5187,7 +5216,7 @@ version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6"
 dependencies = [
- "semver-parser 0.10.2",
+ "semver-parser 0.10.3",
 ]
 
 [[package]]
@@ -5207,9 +5236,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
 
 [[package]]
 name = "semver-parser"
-version = "0.10.2"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
+checksum = "9900206b54a3527fdc7b8a938bffd94a568bac4f4aa8113b209df75a09c0dec2"
 dependencies = [
  "pest",
 ]
@@ -5231,14 +5260,14 @@ checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.132"
+version = "1.0.133"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03"
+checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
 dependencies = [
  "itoa",
  "memchr",
@@ -5267,6 +5296,15 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
+[[package]]
+name = "serde_spanned"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_urlencoded"
 version = "0.7.1"
@@ -5323,7 +5361,7 @@ checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -5500,9 +5538,9 @@ dependencies = [
 
 [[package]]
 name = "socket2"
-version = "0.5.7"
+version = "0.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c"
+checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8"
 dependencies = [
  "libc",
  "windows-sys 0.52.0",
@@ -5510,7 +5548,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-build"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "anyhow",
  "cargo_metadata",
@@ -5521,7 +5559,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-cli"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "anstyle",
  "anyhow",
@@ -5550,7 +5588,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-core-executor"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "bytemuck",
@@ -5584,13 +5622,16 @@ dependencies = [
 
 [[package]]
 name = "sp1-core-machine"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
+ "cbindgen",
+ "cc",
  "cfg-if",
  "criterion",
  "elliptic-curve",
- "generic-array 1.1.0",
+ "generic-array 1.1.1",
+ "glob",
  "hashbrown 0.14.5",
  "hex",
  "itertools 0.13.0",
@@ -5608,7 +5649,10 @@ dependencies = [
  "p3-maybe-rayon",
  "p3-uni-stark",
  "p3-util",
+ "pathdiff",
  "rand 0.8.5",
+ "rayon",
+ "rayon-scan",
  "serde",
  "size",
  "snowbridge-amcl",
@@ -5627,14 +5671,14 @@ dependencies = [
  "tiny-keccak",
  "tracing",
  "tracing-forest",
- "tracing-subscriber 0.3.18",
+ "tracing-subscriber 0.3.19",
  "typenum",
  "web-time",
 ]
 
 [[package]]
 name = "sp1-cuda"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "ctrlc",
@@ -5652,13 +5696,13 @@ dependencies = [
 
 [[package]]
 name = "sp1-curves"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "cfg-if",
- "curve25519-dalek",
  "dashu",
+ "ecdsa",
  "elliptic-curve",
- "generic-array 1.1.0",
+ "generic-array 1.1.1",
  "itertools 0.13.0",
  "k256",
  "num",
@@ -5675,7 +5719,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-derive"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "quote",
  "syn 1.0.109",
@@ -5683,7 +5727,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-eval"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "anyhow",
  "bincode",
@@ -5695,20 +5739,22 @@ dependencies = [
  "sp1-prover",
  "sp1-sdk",
  "sp1-stark",
- "time 0.3.36",
+ "time 0.3.37",
  "tokio",
 ]
 
 [[package]]
 name = "sp1-helper"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "sp1-build",
 ]
 
 [[package]]
 name = "sp1-lib"
-version = "3.0.0"
+version = "3.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a5729da1b05d56c01457e5ecabdc77f1cc941df23f2921163a2f325aec22428"
 dependencies = [
  "bincode",
  "serde",
@@ -5716,9 +5762,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-lib"
-version = "3.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c8744af050832df5ca44fcd63979a83b93ca3010b2d5a5ce2a2b91f7438065c"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "serde",
@@ -5726,7 +5770,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-perf"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "clap",
@@ -5737,12 +5781,12 @@ dependencies = [
  "sp1-sdk",
  "sp1-stark",
  "test-artifacts",
- "time 0.3.36",
+ "time 0.3.37",
 ]
 
 [[package]]
 name = "sp1-primitives"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "hex",
@@ -5758,7 +5802,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-prover"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "anyhow",
  "bincode",
@@ -5790,12 +5834,12 @@ dependencies = [
  "thiserror 1.0.69",
  "tracing",
  "tracing-appender",
- "tracing-subscriber 0.3.18",
+ "tracing-subscriber 0.3.19",
 ]
 
 [[package]]
 name = "sp1-recursion-circuit"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "ff 0.13.0",
  "hashbrown 0.14.5",
@@ -5832,7 +5876,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-compiler"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "backtrace",
  "criterion",
@@ -5857,12 +5901,16 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-core"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "backtrace",
+ "cbindgen",
+ "cc",
  "ff 0.13.0",
+ "glob",
  "hashbrown 0.14.5",
  "itertools 0.13.0",
+ "num_cpus",
  "p3-air",
  "p3-baby-bear",
  "p3-bn254-fr",
@@ -5877,6 +5925,7 @@ dependencies = [
  "p3-poseidon2",
  "p3-symmetric",
  "p3-util",
+ "pathdiff",
  "rand 0.8.5",
  "serde",
  "sp1-core-machine",
@@ -5892,7 +5941,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-derive"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "quote",
  "syn 1.0.109",
@@ -5900,7 +5949,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-gnark-cli"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "clap",
@@ -5909,7 +5958,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-gnark-ffi"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "anyhow",
  "bincode",
@@ -5933,7 +5982,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-sdk"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "alloy-primitives",
  "alloy-signer",
@@ -5979,11 +6028,12 @@ dependencies = [
 
 [[package]]
 name = "sp1-stark"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "arrayref",
  "hashbrown 0.14.5",
  "itertools 0.13.0",
+ "num-bigint 0.4.6",
  "num-traits",
  "p3-air",
  "p3-baby-bear",
@@ -6012,7 +6062,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-verifier"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "ark-bn254",
  "ark-ec",
@@ -6026,12 +6076,12 @@ dependencies = [
  "sha2 0.10.8",
  "sp1-sdk",
  "substrate-bn-succinct",
- "thiserror 2.0.3",
+ "thiserror-no-std",
 ]
 
 [[package]]
 name = "sp1-zkvm"
-version = "3.0.1"
+version = "4.0.0-rc.2"
 dependencies = [
  "cfg-if",
  "getrandom 0.2.15",
@@ -6041,7 +6091,7 @@ dependencies = [
  "p3-field",
  "rand 0.8.5",
  "sha2 0.10.8",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-primitives",
 ]
 
@@ -6173,11 +6223,11 @@ version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6194,7 +6244,7 @@ dependencies = [
  "num-bigint 0.4.6",
  "rand 0.8.5",
  "rustc-hex",
- "sp1-lib 3.2.1",
+ "sp1-lib 3.4.0",
 ]
 
 [[package]]
@@ -6239,9 +6289,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.87"
+version = "2.0.90"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
+checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -6250,14 +6300,14 @@ dependencies = [
 
 [[package]]
 name = "syn-solidity"
-version = "0.8.12"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f76fe0a3e1476bdaa0775b9aec5b869ed9520c2b2fedfe9c6df3618f8ea6290b"
+checksum = "da0523f59468a2696391f2a772edc089342aacd53c3caa2ac3264e598edf119b"
 dependencies = [
  "paste",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6268,9 +6318,9 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
 
 [[package]]
 name = "sync_wrapper"
-version = "1.0.1"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
 dependencies = [
  "futures-core",
 ]
@@ -6283,7 +6333,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6308,7 +6358,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
 dependencies = [
  "bitflags",
- "core-foundation",
+ "core-foundation 0.9.4",
  "system-configuration-sys",
 ]
 
@@ -6360,7 +6410,7 @@ dependencies = [
 
 [[package]]
 name = "test-artifacts"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "sp1-build",
 ]
@@ -6387,11 +6437,11 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "2.0.3"
+version = "2.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa"
+checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490"
 dependencies = [
- "thiserror-impl 2.0.3",
+ "thiserror-impl 2.0.4",
 ]
 
 [[package]]
@@ -6402,18 +6452,38 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "2.0.3"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
+]
+
+[[package]]
+name = "thiserror-impl-no-std"
+version = "2.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568"
+checksum = "58e6318948b519ba6dc2b442a6d0b904ebfb8d411a3ad3e07843615a72249758"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "thiserror-no-std"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3ad459d94dd517257cc96add8a43190ee620011bb6e6cdc82dafd97dfafafea"
+dependencies = [
+ "thiserror-impl-no-std",
 ]
 
 [[package]]
@@ -6452,9 +6522,9 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.36"
+version = "0.3.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
+checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21"
 dependencies = [
  "deranged",
  "itoa",
@@ -6464,7 +6534,7 @@ dependencies = [
  "powerfmt",
  "serde",
  "time-core",
- "time-macros 0.2.18",
+ "time-macros 0.2.19",
 ]
 
 [[package]]
@@ -6485,9 +6555,9 @@ dependencies = [
 
 [[package]]
 name = "time-macros"
-version = "0.2.18"
+version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
+checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de"
 dependencies = [
  "num-conv",
  "time-core",
@@ -6552,12 +6622,12 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.41.1"
+version = "1.42.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33"
+checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551"
 dependencies = [
  "backtrace",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "libc",
  "mio",
  "parking_lot",
@@ -6576,7 +6646,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6595,7 +6665,7 @@ version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
 dependencies = [
- "rustls 0.23.16",
+ "rustls 0.23.19",
  "rustls-pki-types",
  "tokio",
 ]
@@ -6613,22 +6683,37 @@ dependencies = [
 
 [[package]]
 name = "tokio-util"
-version = "0.7.12"
+version = "0.7.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a"
+checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078"
 dependencies = [
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "futures-core",
  "futures-sink",
  "pin-project-lite",
  "tokio",
 ]
 
+[[package]]
+name = "toml"
+version = "0.8.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit 0.22.22",
+]
+
 [[package]]
 name = "toml_datetime"
 version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
+dependencies = [
+ "serde",
+]
 
 [[package]]
 name = "toml_edit"
@@ -6636,7 +6721,7 @@ version = "0.19.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
  "toml_datetime",
  "winnow 0.5.40",
 ]
@@ -6647,7 +6732,9 @@ version = "0.22.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
+ "serde",
+ "serde_spanned",
  "toml_datetime",
  "winnow 0.6.20",
 ]
@@ -6662,9 +6749,9 @@ dependencies = [
  "async-trait",
  "axum",
  "base64 0.22.1",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "h2",
- "http 1.1.0",
+ "http 1.2.0",
  "http-body",
  "http-body-util",
  "hyper",
@@ -6735,9 +6822,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
 
 [[package]]
 name = "tracing"
-version = "0.1.40"
+version = "0.1.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
 dependencies = [
  "log",
  "pin-project-lite",
@@ -6753,26 +6840,26 @@ checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
 dependencies = [
  "crossbeam-channel",
  "thiserror 1.0.69",
- "time 0.3.36",
- "tracing-subscriber 0.3.18",
+ "time 0.3.37",
+ "tracing-subscriber 0.3.19",
 ]
 
 [[package]]
 name = "tracing-attributes"
-version = "0.1.27"
+version = "0.1.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "tracing-core"
-version = "0.1.32"
+version = "0.1.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
 dependencies = [
  "once_cell",
  "valuable",
@@ -6788,7 +6875,7 @@ dependencies = [
  "smallvec",
  "thiserror 1.0.69",
  "tracing",
- "tracing-subscriber 0.3.18",
+ "tracing-subscriber 0.3.19",
 ]
 
 [[package]]
@@ -6823,9 +6910,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-subscriber"
-version = "0.3.18"
+version = "0.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
+checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
 dependencies = [
  "matchers",
  "nu-ansi-term",
@@ -6853,7 +6940,7 @@ checksum = "30ee6ab729cd4cf0fd55218530c4522ed30b7b6081752839b68fcec8d0960788"
 dependencies = [
  "base64 0.13.1",
  "byteorder",
- "bytes 1.8.0",
+ "bytes 1.9.0",
  "http 0.2.12",
  "httparse",
  "log",
@@ -6882,7 +6969,7 @@ dependencies = [
  "async-trait",
  "axum",
  "futures",
- "http 1.1.0",
+ "http 1.2.0",
  "http-body-util",
  "hyper",
  "prost",
@@ -6933,9 +7020,9 @@ checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.13"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
 [[package]]
 name = "unicode-linebreak"
@@ -6985,9 +7072,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
 
 [[package]]
 name = "url"
-version = "2.5.3"
+version = "2.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada"
+checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
 dependencies = [
  "form_urlencoded",
  "idna",
@@ -7056,7 +7143,7 @@ dependencies = [
  "cfg-if",
  "git2",
  "rustversion",
- "time 0.3.36",
+ "time 0.3.37",
 ]
 
 [[package]]
@@ -7113,9 +7200,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e"
+checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c"
 dependencies = [
  "cfg-if",
  "once_cell",
@@ -7124,36 +7211,37 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358"
+checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd"
 dependencies = [
  "bumpalo",
  "log",
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.45"
+version = "0.4.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b"
+checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d"
 dependencies = [
  "cfg-if",
  "js-sys",
+ "once_cell",
  "wasm-bindgen",
  "web-sys",
 ]
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56"
+checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -7161,22 +7249,22 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68"
+checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d"
+checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49"
 
 [[package]]
 name = "wasm-streams"
@@ -7193,9 +7281,9 @@ dependencies = [
 
 [[package]]
 name = "web-sys"
-version = "0.3.72"
+version = "0.3.74"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112"
+checksum = "a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -7232,9 +7320,9 @@ dependencies = [
 
 [[package]]
 name = "webpki-roots"
-version = "0.26.6"
+version = "0.26.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958"
+checksum = "5d642ff16b7e79272ae451b7322067cdc17cadf68c23264be9d94a32319efe7e"
 dependencies = [
  "rustls-pki-types",
 ]
@@ -7514,9 +7602,9 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
 
 [[package]]
 name = "yoke"
-version = "0.7.4"
+version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
+checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
 dependencies = [
  "serde",
  "stable_deref_trait",
@@ -7526,13 +7614,13 @@ dependencies = [
 
 [[package]]
 name = "yoke-derive"
-version = "0.7.4"
+version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
+checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "synstructure",
 ]
 
@@ -7554,27 +7642,27 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "zerofrom"
-version = "0.1.4"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
+checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
 dependencies = [
  "zerofrom-derive",
 ]
 
 [[package]]
 name = "zerofrom-derive"
-version = "0.1.4"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
+checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "synstructure",
 ]
 
@@ -7595,7 +7683,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -7617,7 +7705,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index f48c993410..3fc09809df 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace.package]
-version = "3.0.0"
+version = "4.0.0-rc.2"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 rust-version = "1.79"
@@ -30,6 +30,7 @@ members = [
   "crates/cuda",
   "crates/verifier",
   "crates/stark",
+  "crates/verifier",
   "crates/zkvm/*",
   "crates/test-artifacts",
 ]
@@ -49,32 +50,35 @@ debug-assertions = true
 
 [workspace.dependencies]
 # sp1
-sp1-build = { path = "crates/build", version = "3.0.0" }
-sp1-cli = { path = "crates/cli", version = "3.0.0", default-features = false }
-sp1-core-machine = { path = "crates/core/machine", version = "3.0.0" }
-sp1-core-executor = { path = "crates/core/executor", version = "3.0.0" }
-sp1-curves = { path = "crates/curves", version = "3.0.0" }
-sp1-derive = { path = "crates/derive", version = "3.0.0" }
-sp1-eval = { path = "crates/eval", version = "3.0.0" }
-sp1-helper = { path = "crates/helper", version = "3.0.0", default-features = false }
-sp1-primitives = { path = "crates/primitives", version = "3.0.0" }
-sp1-prover = { path = "crates/prover", version = "3.0.0" }
-sp1-recursion-compiler = { path = "crates/recursion/compiler", version = "3.0.0" }
-sp1-recursion-core = { path = "crates/recursion/core", version = "3.0.0", default-features = false }
-sp1-recursion-derive = { path = "crates/recursion/derive", version = "3.0.0", default-features = false }
-sp1-recursion-gnark-ffi = { path = "crates/recursion/gnark-ffi", version = "3.0.0", default-features = false }
-sp1-recursion-circuit = { path = "crates/recursion/circuit", version = "3.0.0", default-features = false }
-sp1-sdk = { path = "crates/sdk", version = "3.0.0" }
-sp1-cuda = { path = "crates/cuda", version = "3.0.0" }
-sp1-stark = { path = "crates/stark", version = "3.0.0" }
-sp1-lib = { path = "crates/zkvm/lib", version = "3.0.0", default-features = false }
+sp1-build = { path = "crates/build", version = "=4.0.0-rc.2" }
+sp1-cli = { path = "crates/cli", version = "=4.0.0-rc.2", default-features = false }
+sp1-core-machine = { path = "crates/core/machine", version = "=4.0.0-rc.2" }
+sp1-core-executor = { path = "crates/core/executor", version = "=4.0.0-rc.2" }
+sp1-curves = { path = "crates/curves", version = "=4.0.0-rc.2" }
+sp1-derive = { path = "crates/derive", version = "=4.0.0-rc.2" }
+sp1-eval = { path = "crates/eval", version = "=4.0.0-rc.2" }
+sp1-helper = { path = "crates/helper", version = "=4.0.0-rc.2", default-features = false }
+sp1-primitives = { path = "crates/primitives", version = "=4.0.0-rc.2" }
+sp1-prover = { path = "crates/prover", version = "=4.0.0-rc.2" }
+sp1-recursion-compiler = { path = "crates/recursion/compiler", version = "=4.0.0-rc.2" }
+sp1-recursion-core = { path = "crates/recursion/core", version = "=4.0.0-rc.2", default-features = false }
+sp1-recursion-derive = { path = "crates/recursion/derive", version = "=4.0.0-rc.2", default-features = false }
+sp1-recursion-gnark-ffi = { path = "crates/recursion/gnark-ffi", version = "=4.0.0-rc.2", default-features = false }
+sp1-recursion-circuit = { path = "crates/recursion/circuit", version = "=4.0.0-rc.2", default-features = false }
+sp1-sdk = { path = "crates/sdk", version = "=4.0.0-rc.2" }
+sp1-cuda = { path = "crates/cuda", version = "=4.0.0-rc.2" }
+sp1-stark = { path = "crates/stark", version = "=4.0.0-rc.2" }
+sp1-lib = { path = "crates/zkvm/lib", version = "=4.0.0-rc.2", default-features = false }
 
 # NOTE: The version in this crate is manually set to 3.0.1 right now. When upgrading SP1 versions,
 # make sure to update this crate.
-sp1-zkvm = { path = "crates/zkvm/entrypoint", version = "3.0.1", default-features = false }
+sp1-zkvm = { path = "crates/zkvm/entrypoint", version = "=4.0.0-rc.2", default-features = false }
+
+# For testing.
+test-artifacts = { path = "crates/test-artifacts", version = "=4.0.0-rc.2" }
 
 # For testing.
-test-artifacts = { path = "crates/test-artifacts", version = "3.0.0" }
+# test-artifacts = { path = "crates/test-artifacts", version = "3.0.0" }
 
 # p3
 # p3-air = "0.1.4-succinct"
diff --git a/book/developers/common-issues.md b/book/developers/common-issues.md
index 5b06ec3b43..a40b6d96ac 100644
--- a/book/developers/common-issues.md
+++ b/book/developers/common-issues.md
@@ -51,7 +51,7 @@ This is likely due to two different versions of `alloy_sol_types` being used. To
 
 ```toml
 [dependencies]
-sp1-sdk = { version = "3.0.0", default-features = false }
+sp1-sdk = { version = "4.0.0", default-features = false }
 ```
 
 This will configure out the `network` feature which will remove the dependency on `alloy_sol_types` and configure out the `NetworkProver`.
diff --git a/book/getting-started/install.md b/book/getting-started/install.md
index e58b9a624e..7c1a4ae85b 100644
--- a/book/getting-started/install.md
+++ b/book/getting-started/install.md
@@ -91,7 +91,7 @@ git clone git@github.com:succinctlabs/sp1.git
 cd sp1
 cd crates
 cd cli
-cargo install --locked --path .
+cargo install --locked --force --path .
 cd ~
 cargo prove build-toolchain
 ```
diff --git a/book/writing-programs/patched-crates.md b/book/writing-programs/patched-crates.md
index 0bca72416b..e49c707eab 100644
--- a/book/writing-programs/patched-crates.md
+++ b/book/writing-programs/patched-crates.md
@@ -36,14 +36,21 @@ sha3-v0-10-6 = { git = "https://github.com/sp1-patches/RustCrypto-hashes", packa
 sha3-v0-10-8 = { git = "https://github.com/sp1-patches/RustCrypto-hashes", package = "sha3", tag = "sha3-v0.10.8-patch-v1" }
 crypto-bigint = { git = "https://github.com/sp1-patches/RustCrypto-bigint", tag = "crypto_bigint-v0.5.5-patch-v1" }
 tiny-keccak = { git = "https://github.com/sp1-patches/tiny-keccak", tag = "tiny_keccak-v2.0.2-patch-v1" }
+substrate-bn = { git = "https://github.com/sp1-patches/bn", tag = "substrate_bn-v0.6.0-patch-v1" }
+bls12_381 = { git = "https://github.com/sp1-patches/bls12_381", tag = "bls12_381-v0.8.0-patch-v1" }
+
+# For sp1 versions >= 3.4.0
+curve25519-dalek = { git = "https://github.com/sp1-patches/curve25519-dalek", tag = "patch-v4.1.3-v3.4.0" }
+# For sp1 versions < 3.4.0
 curve25519-dalek = { git = "https://github.com/sp1-patches/curve25519-dalek", tag = "curve25519_dalek-v4.1.3-patch-v1" }
 curve25519-dalek-ng = { git = "https://github.com/sp1-patches/curve25519-dalek-ng", tag = "curve25519_dalek_ng-v4.1.1-patch-v1" }
 ed25519-consensus = { git = "https://github.com/sp1-patches/ed25519-consensus", tag = "ed25519_consensus-v2.1.0-patch-v1" }
+# For sp1 versions >= 3.3.0
+ecdsa-core = { git = "https://github.com/sp1-patches/signatures", package = "ecdsa", tag = "ecdsa-v0.16.9-patch-v3.3.0" }
+secp256k1 = { git = "https://github.com/sp1-patches/rust-secp256k1", tag = "secp256k1-v0.29.0-patch-v3.3.0" }
+# For sp1 versions < 3.3.0
 ecdsa-core = { git = "https://github.com/sp1-patches/signatures", package = "ecdsa", tag = "ecdsa-v0.16.9-patch-v1" }
 secp256k1 = { git = "https://github.com/sp1-patches/rust-secp256k1", tag = "secp256k1-v0.29.0-patch-v1" }
-substrate-bn = { git = "https://github.com/sp1-patches/bn", tag = "substrate_bn-v0.6.0-patch-v1" }
-bls12_381 = { git = "https://github.com/sp1-patches/bls12_381", tag = "bls12_381-v0.8.0-patch-v1" }
-
 ```
 
 If you are patching a crate from Github instead of from crates.io, you need to specify the
diff --git a/crates/cli/README.md b/crates/cli/README.md
index 30e8f31596..5bdd852918 100644
--- a/crates/cli/README.md
+++ b/crates/cli/README.md
@@ -21,7 +21,7 @@ cargo run --bin cargo-prove -- prove trace --elf <...> --trace <...>
 You can install the CLI locally from source by running the following command:
 
 ```bash
-cargo install --locked --path .
+cargo install --locked --force --path .
 ```
 
 ### Running the CLI after installing
diff --git a/crates/cli/src/commands/build_toolchain.rs b/crates/cli/src/commands/build_toolchain.rs
index 97f1aae009..0e13d56cb5 100644
--- a/crates/cli/src/commands/build_toolchain.rs
+++ b/crates/cli/src/commands/build_toolchain.rs
@@ -64,7 +64,8 @@ impl BuildToolchainCmd {
         std::fs::write(&config_file, config_toml)
             .with_context(|| format!("while writing configuration to {:?}", config_file))?;
 
-        // Work around target sanity check added in rust-lang/rust@09c076810cb7649e5817f316215010d49e78e8d7.
+        // Work around target sanity check added in
+        // rust-lang/rust@09c076810cb7649e5817f316215010d49e78e8d7.
         let temp_dir = std::env::temp_dir().join("rustc-targets");
         if !temp_dir.exists() {
             std::fs::create_dir_all(&temp_dir)?;
diff --git a/crates/core/executor/src/events/alu.rs b/crates/core/executor/src/events/alu.rs
index 2d2b14fe03..a42ce9ee82 100644
--- a/crates/core/executor/src/events/alu.rs
+++ b/crates/core/executor/src/events/alu.rs
@@ -9,6 +9,7 @@ use super::{create_random_lookup_ids, LookupId};
 /// This object encapsulated the information needed to prove an ALU operation. This includes its
 /// shard, opcode, operands, and other relevant information.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+#[repr(C)]
 pub struct AluEvent {
     /// The lookup identifier.
     pub lookup_id: LookupId,
diff --git a/crates/core/executor/src/events/memory.rs b/crates/core/executor/src/events/memory.rs
index 655e0fc21d..d0e07109fb 100644
--- a/crates/core/executor/src/events/memory.rs
+++ b/crates/core/executor/src/events/memory.rs
@@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
 /// This object encapsulates the information needed to prove a memory access operation. This
 /// includes the shard, timestamp, and value of the memory address.
 #[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
+#[repr(C)]
 pub struct MemoryRecord {
     /// The shard number.
     pub shard: u32,
@@ -39,6 +40,7 @@ pub enum MemoryAccessPosition {
 /// includes the value, shard, timestamp, and previous shard and timestamp.
 #[allow(clippy::manual_non_exhaustive)]
 #[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
+#[repr(C)]
 pub struct MemoryReadRecord {
     /// The value.
     pub value: u32,
@@ -58,6 +60,7 @@ pub struct MemoryReadRecord {
 /// includes the value, shard, timestamp, previous value, previous shard, and previous timestamp.
 #[allow(clippy::manual_non_exhaustive)]
 #[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
+#[repr(C)]
 pub struct MemoryWriteRecord {
     /// The value.
     pub value: u32,
@@ -126,7 +129,8 @@ impl MemoryRecordEnum {
 /// This object encapsulates the information needed to prove a memory initialize or finalize
 /// operation. This includes the address, value, shard, timestamp, and whether the memory is
 /// initialized or finalized.
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+#[repr(C)]
 pub struct MemoryInitializeFinalizeEvent {
     /// The address.
     pub addr: u32,
@@ -223,7 +227,8 @@ impl From<MemoryWriteRecord> for MemoryRecordEnum {
 /// This object encapsulates the information needed to prove a memory access operation within a
 /// shard. This includes the address, initial memory access, and final memory access within a
 /// shard.
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+#[repr(C)]
 pub struct MemoryLocalEvent {
     /// The address.
     pub addr: u32,
diff --git a/crates/core/executor/src/events/syscall.rs b/crates/core/executor/src/events/syscall.rs
index 23f9263ba8..09227c5a50 100644
--- a/crates/core/executor/src/events/syscall.rs
+++ b/crates/core/executor/src/events/syscall.rs
@@ -4,9 +4,10 @@ use super::LookupId;
 
 /// Syscall Event.
 ///
-/// This object encapsulated the information needed to prove a syscall invocation from the CPU table.
-/// This includes its shard, clk, syscall id, arguments, other relevant information.
+/// This object encapsulates the information needed to prove a syscall invocation from the CPU
+/// table. This includes its shard, clk, syscall id, arguments, and other relevant information.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+#[repr(C)]
 pub struct SyscallEvent {
     /// The shard number.
     pub shard: u32,
diff --git a/crates/core/executor/src/events/utils.rs b/crates/core/executor/src/events/utils.rs
index ea2b630931..2c70f67c8d 100644
--- a/crates/core/executor/src/events/utils.rs
+++ b/crates/core/executor/src/events/utils.rs
@@ -1,10 +1,9 @@
-use serde::Deserialize;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use std::fmt::Display;
 
 /// A unique identifier for lookups.
 #[derive(Deserialize, Serialize, Debug, Clone, Copy, Default, Eq, Hash, PartialEq)]
-
+#[repr(C)]
 pub struct LookupId(pub u64);
 
 /// Create a random lookup id. This is slower than `record.create_lookup_id()` but is useful for
diff --git a/crates/core/executor/src/executor.rs b/crates/core/executor/src/executor.rs
index a9e0834a45..c2cc1915ab 100644
--- a/crates/core/executor/src/executor.rs
+++ b/crates/core/executor/src/executor.rs
@@ -6,7 +6,7 @@ use std::{
 
 use hashbrown::HashMap;
 use serde::{Deserialize, Serialize};
-use sp1_stark::SP1CoreOpts;
+use sp1_stark::{air::PublicValues, SP1CoreOpts};
 use thiserror::Error;
 
 use crate::{
@@ -79,8 +79,9 @@ pub struct Executor<'a> {
     /// checkpoints.
     pub memory_checkpoint: PagedMemory<Option<MemoryRecord>>,
 
-    /// Memory addresses that were initialized in this batch of shards. Used to minimize the size of
-    /// checkpoints. The value stored is whether or not it had a value at the beginning of the batch.
+    /// Memory addresses that were initialized in this batch of shards. Used to minimize the size
+    /// of checkpoints. The value stored is whether or not it had a value at the beginning of
+    /// the batch.
     pub uninitialized_memory_checkpoint: PagedMemory<bool>,
 
     /// The memory accesses for the current cycle.
@@ -1390,7 +1391,7 @@ impl<'a> Executor<'a> {
     pub fn execute_state(
         &mut self,
         emit_global_memory_events: bool,
-    ) -> Result<(ExecutionState, bool), ExecutionError> {
+    ) -> Result<(ExecutionState, PublicValues<u32, u32>, bool), ExecutionError> {
         self.memory_checkpoint.clear();
         self.executor_mode = ExecutorMode::Checkpoint;
         self.emit_global_memory_events = emit_global_memory_events;
@@ -1405,6 +1406,7 @@ impl<'a> Executor<'a> {
         let done = tracing::debug_span!("execute").in_scope(|| self.execute())?;
         // Create a checkpoint using `memory_checkpoint`. Just include all memory if `done` since we
         // need it all for MemoryFinalize.
+        let next_pc = self.state.pc;
         tracing::debug_span!("create memory checkpoint").in_scope(|| {
             let memory_checkpoint = std::mem::take(&mut self.memory_checkpoint);
             let uninitialized_memory_checkpoint =
@@ -1440,10 +1442,14 @@ impl<'a> Executor<'a> {
                     .collect();
             }
         });
+        let mut public_values = self.records.last().as_ref().unwrap().public_values;
+        public_values.start_pc = next_pc;
+        public_values.next_pc = next_pc;
+        println!("public values: {public_values:?}");
         if !done {
             self.records.clear();
         }
-        Ok((checkpoint, done))
+        Ok((checkpoint, public_values, done))
     }
 
     fn initialize(&mut self) {
diff --git a/crates/core/executor/src/hook.rs b/crates/core/executor/src/hook.rs
index e5479f623f..f5a6a3ebab 100644
--- a/crates/core/executor/src/hook.rs
+++ b/crates/core/executor/src/hook.rs
@@ -3,8 +3,11 @@ use core::fmt::Debug;
 use std::sync::{Arc, RwLock, RwLockWriteGuard};
 
 use hashbrown::HashMap;
-use sp1_curves::k256::{Invert, RecoveryId, Signature, VerifyingKey};
-use sp1_curves::p256::Signature as p256Signature;
+use sp1_curves::{
+    ecdsa::RecoveryId as ecdsaRecoveryId,
+    k256::{Invert, RecoveryId, Signature, VerifyingKey},
+    p256::{Invert as p256Invert, Signature as p256Signature, VerifyingKey as p256VerifyingKey},
+};
 
 use crate::Executor;
 
@@ -12,9 +15,11 @@ use crate::Executor;
 pub type BoxedHook<'a> = Arc<RwLock<dyn Hook + Send + Sync + 'a>>;
 
 /// The file descriptor through which to access `hook_k1_ecrecover`.
-pub const K1_ECRECOVER_HOOK: u32 = 5;
+pub const FD_K1_ECRECOVER_HOOK: u32 = 5;
 /// The file descriptor through which to access `hook_r1_ecrecover`.
-pub const R1_ECRECOVER_HOOK: u32 = 6;
+pub const FD_R1_ECRECOVER_HOOK: u32 = 6;
+/// The file descriptor through which to access `hook_ed_decompress`.
+pub const FD_EDDECOMPRESS: u32 = 8;
 
 /// A runtime hook. May be called during execution by writing to a specified file descriptor,
 /// accepting and returning arbitrary data.
@@ -78,8 +83,9 @@ impl<'a> Default for HookRegistry<'a> {
         let table = HashMap::from([
             // Note: To ensure any `fd` value is synced with `zkvm/precompiles/src/io.rs`,
             // add an assertion to the test `hook_fds_match` below.
-            (K1_ECRECOVER_HOOK, hookify(hook_k1_ecrecover)),
-            (R1_ECRECOVER_HOOK, hookify(hook_r1_ecrecover)),
+            (FD_K1_ECRECOVER_HOOK, hookify(hook_k1_ecrecover)),
+            (FD_R1_ECRECOVER_HOOK, hookify(hook_r1_ecrecover)),
+            (FD_EDDECOMPRESS, hookify(hook_ed_decompress)),
         ]);
 
         Self { table }
@@ -115,55 +121,106 @@ pub struct HookEnv<'a, 'b: 'a> {
 ///       recovery ID.
 ///     - The message hash is 32 bytes.
 ///
-/// The result is returned as a pair of bytes, where the first 32 bytes are the X coordinate
+/// The result is returned as a status and a pair of bytes, where the first 32 bytes are the X coordinate
 /// and the second 32 bytes are the Y coordinate of the decompressed point.
 ///
+/// A status of 0 indicates that the public key could not be recovered.
+///
 /// WARNING: This function is used to recover the public key outside of the zkVM context. These
 /// values must be constrained by the zkVM for correctness.
 #[must_use]
 pub fn hook_k1_ecrecover(_: HookEnv, buf: &[u8]) -> Vec<Vec<u8>> {
-    assert_eq!(buf.len(), 65 + 32, "ecrecover input should have length 65 + 32");
+    assert_eq!(buf.len(), 65 + 32, "ecrecover input should have length 65 + 32, this is a bug.");
     let (sig, msg_hash) = buf.split_at(65);
     let sig: &[u8; 65] = sig.try_into().unwrap();
     let msg_hash: &[u8; 32] = msg_hash.try_into().unwrap();
 
-    let mut recovery_id = sig[64];
-    let mut sig = Signature::from_slice(&sig[..64]).unwrap();
+    let recovery_id = sig[64];
+    let sig = Signature::from_slice(&sig[..64]).unwrap();
+
+    let recid = RecoveryId::from_byte(recovery_id)
+        .expect("Computed recovery ID is invalid, this is a bug.");
 
-    if let Some(sig_normalized) = sig.normalize_s() {
-        sig = sig_normalized;
-        recovery_id ^= 1;
+    // Attempting to recover the public key has failed, write a 0 to indicate to the caller.
+    let Ok(recovered_key) = VerifyingKey::recover_from_prehash(&msg_hash[..], &sig, recid) else {
+        return vec![vec![0]];
     };
-    let recid = RecoveryId::from_byte(recovery_id).expect("Computed recovery ID is invalid!");
 
-    let recovered_key = VerifyingKey::recover_from_prehash(&msg_hash[..], &sig, recid).unwrap();
     let bytes = recovered_key.to_sec1_bytes();
 
     let (_, s) = sig.split_scalars();
     let s_inverse = s.invert();
 
-    vec![bytes.to_vec(), s_inverse.to_bytes().to_vec()]
+    vec![vec![1], bytes.to_vec(), s_inverse.to_bytes().to_vec()]
 }
 
-/// Recovers s inverse from the signature using the secp256r1 crate.
+/// Checks if a compressed Edwards point can be decompressed.
+///
+/// # Arguments
+/// * `env` - The environment in which the hook is invoked.
+/// * `buf` - The buffer containing the compressed Edwards point.
+///    - The compressed Edwards point is 32 bytes.
+///    - The high bit of the last byte is the sign bit.
+///
+/// The result is either `0` if the point cannot be decompressed, or `1` if it can.
+///
+/// WARNING: This function merely hints at the validity of the compressed point. These values must
+/// be constrained by the zkVM for correctness.
+#[must_use]
+pub fn hook_ed_decompress(_: HookEnv, buf: &[u8]) -> Vec<Vec<u8>> {
+    let Ok(point) = sp1_curves::curve25519_dalek::CompressedEdwardsY::from_slice(buf) else {
+        return vec![vec![0]];
+    };
+
+    if sp1_curves::edwards::ed25519::decompress(&point).is_some() {
+        vec![vec![1]]
+    } else {
+        vec![vec![0]]
+    }
+}
+
+/// Recovers the public key from the signature and message hash using the p256 crate.
 ///
 /// # Arguments
 ///
 /// * `env` - The environment in which the hook is invoked.
-/// * `buf` - The buffer containing the signature.
-///     - The signature is 64 bytes.
+/// * `buf` - The buffer containing the signature and message hash.
+///     - The signature is 65 bytes, the first 64 bytes are the signature and the last byte is the
+///       recovery ID.
+///     - The message hash is 32 bytes.
+///
+/// The result is returned as a status byte followed by two byte vectors: the recovered public key
+/// in SEC1-encoded form (`to_encoded_point(true)`) and the inverse of the signature scalar `s`.
 ///
-/// The result is a single 32 byte vector containing s inverse.
+/// A status of 0 indicates that the public key could not be recovered.
+///
+/// WARNING: This function is used to recover the public key outside of the zkVM context. These
+/// values must be constrained by the zkVM for correctness.
 #[must_use]
 pub fn hook_r1_ecrecover(_: HookEnv, buf: &[u8]) -> Vec<Vec<u8>> {
-    assert_eq!(buf.len(), 64, "ecrecover input should have length 64");
-    let sig: &[u8; 64] = buf.try_into().unwrap();
-    let sig = p256Signature::from_slice(sig).unwrap();
+    assert_eq!(buf.len(), 65 + 32, "ecrecover input should have length 65 + 32, this is a bug.");
+    let (sig, msg_hash) = buf.split_at(65);
+    let sig: &[u8; 65] = sig.try_into().unwrap();
+    let msg_hash: &[u8; 32] = msg_hash.try_into().unwrap();
+
+    let recovery_id = sig[64];
+    let sig = p256Signature::from_slice(&sig[..64]).unwrap();
+    let recid = ecdsaRecoveryId::from_byte(recovery_id)
+        .expect("Computed recovery ID is invalid, this is a bug.");
+
+    // Attempting to recover the public key has failed, write a 0 to indicate to the caller.
+    let Ok(recovered_key) = p256VerifyingKey::recover_from_prehash(&msg_hash[..], &sig, recid)
+    else {
+        return vec![vec![0]];
+    };
+
+    let recovered_key_encoded = recovered_key.to_encoded_point(true);
+    let bytes = recovered_key_encoded.as_bytes();
 
     let (_, s) = sig.split_scalars();
     let s_inverse = s.invert();
 
-    vec![s_inverse.to_bytes().to_vec()]
+    vec![vec![1], bytes.to_vec(), s_inverse.to_bytes().to_vec()]
 }
 
 #[cfg(test)]
@@ -173,8 +230,8 @@ pub mod tests {
     #[test]
     pub fn hook_fds_match() {
         use sp1_zkvm::lib::io;
-        assert_eq!(K1_ECRECOVER_HOOK, io::K1_ECRECOVER_HOOK);
-        assert_eq!(R1_ECRECOVER_HOOK, io::R1_ECRECOVER_HOOK);
+        assert_eq!(FD_K1_ECRECOVER_HOOK, io::FD_K1_ECRECOVER_HOOK);
+        assert_eq!(FD_R1_ECRECOVER_HOOK, io::FD_R1_ECRECOVER_HOOK);
     }
 
     #[test]
diff --git a/crates/core/executor/src/instruction.rs b/crates/core/executor/src/instruction.rs
index 10dfa5476d..cbea85daaf 100644
--- a/crates/core/executor/src/instruction.rs
+++ b/crates/core/executor/src/instruction.rs
@@ -11,6 +11,7 @@ use crate::opcode::Opcode;
 /// as 32-bit words, but instead use a custom encoding that is more friendly to decode in the
 /// SP1 zkVM.
 #[derive(Clone, Copy, Serialize, Deserialize)]
+#[repr(C)]
 pub struct Instruction {
     /// The operation to execute.
     pub opcode: Opcode,
diff --git a/crates/core/executor/src/memory.rs b/crates/core/executor/src/memory.rs
index a036bbf5ca..bdaf468a6f 100644
--- a/crates/core/executor/src/memory.rs
+++ b/crates/core/executor/src/memory.rs
@@ -198,7 +198,8 @@ impl<'a, V: Copy> Entry<'a, V> {
         }
     }
 
-    /// Provides in-place mutable access to an occupied entry before any potential inserts into the map.
+    /// Provides in-place mutable access to an occupied entry before any potential inserts into the
+    /// map.
     pub fn and_modify<F: FnOnce(&mut V)>(mut self, f: F) -> Self {
         match &mut self {
             Entry::Vacant(_) => {}
diff --git a/crates/core/executor/src/opcode.rs b/crates/core/executor/src/opcode.rs
index 818b5b1f2b..b8dd250e95 100644
--- a/crates/core/executor/src/opcode.rs
+++ b/crates/core/executor/src/opcode.rs
@@ -24,6 +24,7 @@ use serde::{Deserialize, Serialize};
 #[derive(
     Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord, Enum,
 )]
+#[repr(u8)]
 pub enum Opcode {
     /// rd ← rs1 + rs2, pc ← pc + 4
     ADD = 0,
diff --git a/crates/core/executor/src/program.rs b/crates/core/executor/src/program.rs
index 09bb70cac4..8dcf4ef715 100644
--- a/crates/core/executor/src/program.rs
+++ b/crates/core/executor/src/program.rs
@@ -2,17 +2,23 @@
 
 use std::{fs::File, io::Read};
 
-use hashbrown::HashMap;
-use p3_field::Field;
-use serde::{Deserialize, Serialize};
-use sp1_stark::air::{MachineAir, MachineProgram};
-
 use crate::{
     disassembler::{transpile, Elf},
     instruction::Instruction,
     CoreShape,
 };
-
+use hashbrown::HashMap;
+use p3_field::AbstractExtensionField;
+use p3_field::Field;
+use p3_field::PrimeField;
+use p3_maybe_rayon::prelude::IntoParallelIterator;
+use p3_maybe_rayon::prelude::{ParallelBridge, ParallelIterator};
+use serde::{Deserialize, Serialize};
+use sp1_stark::air::{MachineAir, MachineProgram};
+use sp1_stark::septic_curve::{SepticCurve, SepticCurveComplete};
+use sp1_stark::septic_digest::SepticDigest;
+use sp1_stark::septic_extension::SepticExtension;
+use sp1_stark::InteractionKind;
 /// A program that can be executed by the SP1 zkVM.
 ///
 /// Contains a series of instructions along with the initial memory image. It also contains the
@@ -98,8 +104,35 @@ impl Program {
     }
 }
 
-impl<F: Field> MachineProgram<F> for Program {
+impl<F: PrimeField> MachineProgram<F> for Program {
     fn pc_start(&self) -> F {
         F::from_canonical_u32(self.pc_start)
     }
+
+    /// Computes the initial global cumulative sum for this program's memory image.
+    ///
+    /// Every `(addr, word)` entry is encoded as a memory-interaction message, lifted to a curve
+    /// point, and negated; the results are summed together with the zero digest.
+    fn initial_global_cumulative_sum(&self) -> SepticDigest<F> {
+        let mut points: Vec<SepticCurveComplete<F>> = self
+            .memory_image
+            .iter()
+            .par_bridge()
+            .map(|(&addr, &word)| {
+                // Message layout: interaction kind tag, a zero, the address, then the word's
+                // four bytes in little-endian order.
+                let [b0, b1, b2, b3] = word.to_le_bytes().map(u32::from);
+                let message = [(InteractionKind::Memory as u32) << 24, 0, addr, b0, b1, b2, b3];
+                let x_start =
+                    SepticExtension::<F>::from_base_fn(|i| F::from_canonical_u32(message[i]));
+                let (lifted, _) = SepticCurve::<F>::lift_x(x_start);
+                SepticCurveComplete::Affine(lifted.neg())
+            })
+            .collect();
+        points.push(SepticCurveComplete::Affine(SepticDigest::<F>::zero().0));
+
+        // Fold all points together with the point at infinity as the identity element.
+        let total = points.into_par_iter().reduce(|| SepticCurveComplete::Infinity, |a, b| a + b);
+        SepticDigest(total.point())
+    }
 }
diff --git a/crates/core/executor/src/record.rs b/crates/core/executor/src/record.rs
index f9e89acb4c..fbf79cf01a 100644
--- a/crates/core/executor/src/record.rs
+++ b/crates/core/executor/src/record.rs
@@ -178,7 +178,15 @@ impl ExecutionRecord {
 
     /// Splits the deferred [`ExecutionRecord`] into multiple [`ExecutionRecord`]s, each which
     /// contain a "reasonable" number of deferred events.
-    pub fn split(&mut self, last: bool, opts: SplitOpts) -> Vec<ExecutionRecord> {
+    ///
+    /// The optional `last_record` will be provided if there are few enough deferred events that
+    /// they can all be packed into the already existing last record.
+    pub fn split(
+        &mut self,
+        last: bool,
+        last_record: Option<&mut ExecutionRecord>,
+        opts: SplitOpts,
+    ) -> Vec<ExecutionRecord> {
         let mut shards = Vec::new();
 
         let precompile_events = take(&mut self.precompile_events);
@@ -216,6 +224,18 @@ impl ExecutionRecord {
             self.global_memory_initialize_events.sort_by_key(|event| event.addr);
             self.global_memory_finalize_events.sort_by_key(|event| event.addr);
 
+            // If there are no precompile shards, and `last_record` is Some, pack the memory events
+            // into the last record.
+            let pack_memory_events_into_last_record = last_record.is_some() && shards.is_empty();
+            let mut blank_record = ExecutionRecord::new(self.program.clone());
+
+            // Unless packing into `last_record`, use a blank record to store the memory events.
+            let last_record_ref = if pack_memory_events_into_last_record {
+                last_record.unwrap()
+            } else {
+                &mut blank_record
+            };
+
             let mut init_addr_bits = [0; 32];
             let mut finalize_addr_bits = [0; 32];
             for mem_chunks in self
@@ -230,28 +250,34 @@ impl ExecutionRecord {
                     EitherOrBoth::Left(mem_init_chunk) => (mem_init_chunk, [].as_slice()),
                     EitherOrBoth::Right(mem_finalize_chunk) => ([].as_slice(), mem_finalize_chunk),
                 };
-                let mut shard = ExecutionRecord::new(self.program.clone());
-                shard.global_memory_initialize_events.extend_from_slice(mem_init_chunk);
-                shard.public_values.previous_init_addr_bits = init_addr_bits;
+                last_record_ref.global_memory_initialize_events.extend_from_slice(mem_init_chunk);
+                last_record_ref.public_values.previous_init_addr_bits = init_addr_bits;
                 if let Some(last_event) = mem_init_chunk.last() {
                     let last_init_addr_bits = core::array::from_fn(|i| (last_event.addr >> i) & 1);
                     init_addr_bits = last_init_addr_bits;
                 }
-                shard.public_values.last_init_addr_bits = init_addr_bits;
+                last_record_ref.public_values.last_init_addr_bits = init_addr_bits;
 
-                shard.global_memory_finalize_events.extend_from_slice(mem_finalize_chunk);
-                shard.public_values.previous_finalize_addr_bits = finalize_addr_bits;
+                last_record_ref.global_memory_finalize_events.extend_from_slice(mem_finalize_chunk);
+                last_record_ref.public_values.previous_finalize_addr_bits = finalize_addr_bits;
                 if let Some(last_event) = mem_finalize_chunk.last() {
                     let last_finalize_addr_bits =
                         core::array::from_fn(|i| (last_event.addr >> i) & 1);
                     finalize_addr_bits = last_finalize_addr_bits;
                 }
-                shard.public_values.last_finalize_addr_bits = finalize_addr_bits;
+                last_record_ref.public_values.last_finalize_addr_bits = finalize_addr_bits;
+
+                if !pack_memory_events_into_last_record {
+                    // If not packing memory events into the last record, add 'last_record_ref'
+                    // to the returned records. `take` replaces `blank_record` with the default.
+                    shards.push(take(last_record_ref));
 
-                shards.push(shard);
+                    // Reset the last record so its program is the correct one. (The default program
+                    // provided by `take` contains no instructions.)
+                    last_record_ref.program = self.program.clone();
+                }
             }
         }
-
         shards
     }
 
diff --git a/crates/core/executor/src/state.rs b/crates/core/executor/src/state.rs
index 4c669188df..55ba22e321 100644
--- a/crates/core/executor/src/state.rs
+++ b/crates/core/executor/src/state.rs
@@ -29,7 +29,8 @@ pub struct ExecutionState {
     /// + timestamp that each memory address was accessed.
     pub memory: PagedMemory<MemoryRecord>,
 
-    /// The global clock keeps track of how many instructions have been executed through all shards.
+    /// The global clock keeps track of how many instructions have been executed through all
+    /// shards.
     pub global_clk: u64,
 
     /// The clock increments by 4 (possibly more in syscalls) for each instruction that has been
diff --git a/crates/core/executor/src/syscalls/code.rs b/crates/core/executor/src/syscalls/code.rs
index 542fda0d04..59fe5c7de8 100644
--- a/crates/core/executor/src/syscalls/code.rs
+++ b/crates/core/executor/src/syscalls/code.rs
@@ -23,6 +23,7 @@ use strum_macros::EnumIter;
 )]
 #[allow(non_camel_case_types)]
 #[allow(clippy::upper_case_acronyms)]
+#[repr(u32)]
 pub enum SyscallCode {
     /// Halts the program.
     HALT = 0x00_00_00_00,
diff --git a/crates/core/executor/src/syscalls/context.rs b/crates/core/executor/src/syscalls/context.rs
index 74dfafb279..64f41dc7ab 100644
--- a/crates/core/executor/src/syscalls/context.rs
+++ b/crates/core/executor/src/syscalls/context.rs
@@ -110,9 +110,9 @@ impl<'a, 'b> SyscallContext<'a, 'b> {
         let mut syscall_local_mem_events = Vec::new();
 
         if !self.rt.unconstrained && self.rt.executor_mode == ExecutorMode::Trace {
-            // Will need to transfer the existing memory local events in the executor to it's record,
-            // and return all the syscall memory local events.  This is similar to what
-            // `bump_record` does.
+            // Will need to transfer the existing memory local events in the executor to its
+            // record, and return all the syscall memory local events.  This is similar
+            // to what `bump_record` does.
             for (addr, event) in self.local_memory_access.drain() {
                 let local_mem_access = self.rt.local_memory_access.remove(&addr);
 
diff --git a/crates/core/executor/src/syscalls/precompiles/edwards/decompress.rs b/crates/core/executor/src/syscalls/precompiles/edwards/decompress.rs
index 6e790ab133..12276aa181 100644
--- a/crates/core/executor/src/syscalls/precompiles/edwards/decompress.rs
+++ b/crates/core/executor/src/syscalls/precompiles/edwards/decompress.rs
@@ -53,7 +53,8 @@ impl<E: EdwardsParameters> Syscall for EdwardsDecompressSyscall<E> {
 
         // Compute actual decompressed X
         let compressed_y = CompressedEdwardsY(compressed_edwards_y);
-        let decompressed = decompress(&compressed_y);
+        let decompressed =
+            decompress(&compressed_y).expect("Decompression failed, syscall invariant violated.");
 
         let mut decompressed_x_bytes = decompressed.x.to_bytes_le();
         decompressed_x_bytes.resize(32, 0u8);
diff --git a/crates/core/executor/src/syscalls/precompiles/fptower/fp2_addsub.rs b/crates/core/executor/src/syscalls/precompiles/fptower/fp2_addsub.rs
index f583432310..b433f6384a 100644
--- a/crates/core/executor/src/syscalls/precompiles/fptower/fp2_addsub.rs
+++ b/crates/core/executor/src/syscalls/precompiles/fptower/fp2_addsub.rs
@@ -86,8 +86,8 @@ impl<P: FpOpField> Syscall for Fp2AddSubSyscall<P> {
             local_mem_access: rt.postprocess(),
         };
         match P::FIELD_TYPE {
-            // All the fp2 add and sub events for a given curve are coalesced to the curve's fp2 add operation.  Only check for
-            // that operation.
+            // All the fp2 add and sub events for a given curve are coalesced to the curve's fp2 add
+            // operation.  Only check for that operation.
             // TODO:  Fix this.
             FieldType::Bn254 => {
                 let syscall_code_key = match syscall_code {
diff --git a/crates/core/machine/Cargo.toml b/crates/core/machine/Cargo.toml
index 0128e67ee5..ff60ab4838 100644
--- a/crates/core/machine/Cargo.toml
+++ b/crates/core/machine/Cargo.toml
@@ -8,6 +8,7 @@ license = { workspace = true }
 repository = { workspace = true }
 keywords = { workspace = true }
 categories = { workspace = true }
+links = "sp1-core-machine-sys"
 
 [dependencies]
 bincode = "1.3.3"
@@ -27,6 +28,9 @@ p3-util = { workspace = true }
 sp1-derive = { workspace = true }
 sp1-primitives = { workspace = true }
 
+rayon = "1.10.0"
+rayon-scan = "0.1.1"
+
 amcl = { package = "snowbridge-amcl", version = "1.0.2", default-features = false, features = [
   "bls381",
 ] }
@@ -65,9 +69,21 @@ sp1-zkvm = { workspace = true }
 sp1-core-executor = { workspace = true, features = ["programs"] }
 test-artifacts = { workspace = true }
 
+[build-dependencies]
+sp1-stark = { workspace = true }
+sp1-primitives = { workspace = true }
+p3-baby-bear = { workspace = true }
+cbindgen = "0.27.0"
+cc = "1.1"
+pathdiff = "0.2.1"
+glob = "0.3.1"
+
 [features]
+default = []
+programs = []
 debug = []
 bigint-rug = ["sp1-curves/bigint-rug"]
+sys = []
 
 [lib]
 bench = false
diff --git a/crates/core/machine/build.rs b/crates/core/machine/build.rs
new file mode 100644
index 0000000000..f2088bd6ac
--- /dev/null
+++ b/crates/core/machine/build.rs
@@ -0,0 +1,169 @@
+fn main() {
+    #[cfg(feature = "sys")]
+    sys::build_ffi();
+}
+
+#[cfg(feature = "sys")]
+mod sys {
+    use std::{
+        env, fs, os,
+        path::{Path, PathBuf},
+    };
+
+    use pathdiff::diff_paths;
+
+    /// The library name, used for the static library archive and the headers.
+    /// Should be chosen as to not conflict with other library/header names.
+    const LIB_NAME: &str = "sp1-core-machine-sys";
+
+    /// The name of all include directories involved, used to find and output header files.
+    const INCLUDE_DIRNAME: &str = "include";
+
+    /// The name of the directory to recursively search for source files in.
+    const SOURCE_DIRNAME: &str = "cpp";
+
+    /// The warning placed in the cbindgen header.
+    const AUTOGEN_WARNING: &str =
+        "/* Automatically generated by `cbindgen`. Not intended for manual editing. */";
+
+    pub fn build_ffi() {
+        // The name of the header generated by `cbindgen`.
+        let cbindgen_hpp = &format!("{LIB_NAME}-cbindgen.hpp");
+
+        // The crate directory.
+        let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
+
+        // The output directory, where built artifacts should be placed.
+        let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
+
+        // The target directory that the cargo invocation is using.
+        // Headers are symlinked into `target/include` purely for IDE purposes.
+        let target_dir = {
+            let mut dir = out_dir.clone();
+            loop {
+                if dir.ends_with("target") {
+                    break dir;
+                }
+                if !dir.pop() {
+                    panic!("OUT_DIR does not have parent called \"target\": {:?}", out_dir);
+                }
+            }
+        };
+
+        // The directory to read headers from.
+        let source_include_dir = crate_dir.join(INCLUDE_DIRNAME);
+
+        // The directory to place headers into.
+        let target_include_dir = out_dir.join(INCLUDE_DIRNAME);
+
+        // The directory to place symlinks to headers into. Has the fixed path "target/include".
+        let target_include_dir_fixed = target_dir.join(INCLUDE_DIRNAME);
+
+        // The directory to read source files from.
+        let source_dir = crate_dir.join(SOURCE_DIRNAME);
+
+        let headers = glob::glob(source_include_dir.join("**/*.hpp").to_str().unwrap())
+            .unwrap()
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+
+        let compilation_units = glob::glob(source_dir.join("**/*.cpp").to_str().unwrap())
+            .unwrap()
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+
+        // Tell Cargo to rerun this build script whenever any of these paths change.
+        println!("cargo::rerun-if-changed={INCLUDE_DIRNAME}");
+        println!("cargo::rerun-if-changed={SOURCE_DIRNAME}");
+        println!("cargo::rerun-if-changed=src");
+        println!("cargo::rerun-if-changed=Cargo.toml");
+
+        // Cargo build script metadata, used by dependents' build scripts.
+        // The root directory containing the library archive.
+        println!("cargo::metadata=root={}", out_dir.to_str().unwrap());
+
+        // The include path defining the library's API.
+        println!("cargo::metadata=include={}", target_include_dir.to_str().unwrap());
+
+        // Generate a header containing bindings to the crate.
+        match cbindgen::Builder::new()
+            .with_pragma_once(true)
+            .with_autogen_warning(AUTOGEN_WARNING)
+            .with_no_includes()
+            .with_sys_include("cstdint")
+            .with_parse_deps(true)
+            .with_parse_include(&[
+                "sp1-stark",
+                "sp1-primitives",
+                "sp1-core-machine",
+                "p3-baby-bear",
+                "sp1-core-executor",
+            ])
+            .with_parse_extra_bindings(&["sp1-stark", "sp1-primitives", "p3-baby-bear"])
+            .rename_item("BabyBear", "BabyBearP3")
+            .include_item("MemoryRecord") // Just for convenience. Not exposed, so we need to manually do this.
+            .include_item("SyscallCode") // Required for populating the CPU columns for ECALL.
+            .include_item("SepticExtension")
+            .include_item("SepticCurve")
+            .include_item("MemoryLocalCols")
+            .include_item("MEMORY_LOCAL_INITIAL_DIGEST_POS")
+            .include_item("Ghost")
+            .include_item("MemoryInitCols")
+            .include_item("MemoryInitializeFinalizeEvent")
+            .with_namespace("sp1_core_machine_sys")
+            .with_crate(crate_dir)
+            .generate()
+        {
+            Ok(bindings) => {
+                // Write the bindings to the target include directory.
+                let header_path = target_include_dir.join(cbindgen_hpp);
+                if bindings.write_to_file(&header_path) {
+                    // Symlink the header to the fixed include directory.
+                    rel_symlink_file(header_path, target_include_dir_fixed.join(cbindgen_hpp));
+                }
+            }
+            Err(cbindgen::Error::ParseSyntaxError { .. }) => {} // Ignore parse errors so rust-analyzer can run.
+            Err(e) => panic!("{:?}", e),
+        }
+
+        // Copy the headers to the include directory and symlink them to the fixed include directory.
+        for header in &headers {
+            // Get the path of the header relative to the source include directory.
+            let relpath = diff_paths(header, &source_include_dir).unwrap();
+
+            // Let the destination path be the same place relative to the target include directory.
+            let dst = target_include_dir.join(&relpath);
+
+            // Create the parent directory if it does not exist.
+            if let Some(parent) = dst.parent() {
+                fs::create_dir_all(parent).unwrap();
+            }
+            fs::copy(header, &dst).unwrap();
+            rel_symlink_file(dst, target_include_dir_fixed.join(relpath));
+        }
+
+        // Use the `cc` crate to build the library and statically link it to the crate.
+        let mut cc_builder = cc::Build::new();
+        cc_builder.files(&compilation_units).include(target_include_dir);
+        cc_builder.cpp(true).std("c++20");
+        cc_builder.compile(LIB_NAME)
+    }
+
+    /// Place a relative symlink pointing to `original` at `link`, replacing any previous link.
+    /// The links only serve IDEs, so a failed `symlink` call emits a Cargo warning, not a panic.
+    fn rel_symlink_file<P: AsRef<Path>, Q: AsRef<Path>>(original: P, link: Q) {
+        #[cfg(unix)]
+        use os::unix::fs::symlink;
+        #[cfg(windows)]
+        use os::windows::fs::symlink_file as symlink;
+
+        let target_dir = link.as_ref().parent().unwrap();
+        fs::create_dir_all(target_dir).unwrap();
+        // Best effort: there may be no previous link to remove.
+        let _ = fs::remove_file(&link);
+        let relpath = diff_paths(original, target_dir).unwrap();
+        if let Err(err) = symlink(relpath, &link) {
+            println!("cargo::warning=failed to symlink {:?}: {err}", link.as_ref());
+        }
+    }
+}
diff --git a/crates/core/machine/cpp/extern.cpp b/crates/core/machine/cpp/extern.cpp
new file mode 100644
index 0000000000..509f5998a7
--- /dev/null
+++ b/crates/core/machine/cpp/extern.cpp
@@ -0,0 +1,28 @@
+#include "bb31_t.hpp"
+#include "bb31_septic_extension_t.hpp"
+#include "sys.hpp"
+
+namespace sp1_core_machine_sys {
+// FFI shims for the BabyBear field: each one reinterprets the `BabyBearP3` column struct as its
+// `bb31_t` counterpart and delegates to the matching templated `event_to_row`.
+// NOTE(review): the casts assume both instantiations share a layout — confirm.
+extern void add_sub_event_to_row_babybear(const AluEvent* event, AddSubCols<BabyBearP3>* cols) {
+    auto* row = reinterpret_cast<AddSubCols<bb31_t>*>(cols);
+    add_sub::event_to_row<bb31_t>(*event, *row);
+}
+
+extern void memory_local_event_to_row_babybear(const MemoryLocalEvent* event, SingleMemoryLocal<BabyBearP3>* cols) {
+    auto* row = reinterpret_cast<SingleMemoryLocal<bb31_t>*>(cols);
+    memory_local::event_to_row<bb31_t, bb31_septic_extension_t>(event, row);
+}
+
+extern void memory_global_event_to_row_babybear(const MemoryInitializeFinalizeEvent* event, const bool is_receive, MemoryInitCols<BabyBearP3>* cols) {
+    auto* row = reinterpret_cast<MemoryInitCols<bb31_t>*>(cols);
+    memory_global::event_to_row<bb31_t, bb31_septic_extension_t>(event, is_receive, row);
+}
+
+extern void syscall_event_to_row_babybear(const SyscallEvent* event, const bool is_receive, SyscallCols<BabyBearP3>* cols) {
+    auto* row = reinterpret_cast<SyscallCols<bb31_t>*>(cols);
+    syscall::event_to_row<bb31_t, bb31_septic_extension_t>(event, is_receive, row);
+}
+} // namespace sp1_core_machine_sys
diff --git a/crates/core/machine/include/add_sub.hpp b/crates/core/machine/include/add_sub.hpp
new file mode 100644
index 0000000000..ee98c21b4f
--- /dev/null
+++ b/crates/core/machine/include/add_sub.hpp
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "prelude.hpp"
+#include "utils.hpp"
+
+namespace sp1_core_machine_sys::add_sub {
+// Writes the byte carries and result word of `a_u32 + b_u32` into `op`; returns the wrapped sum.
+template<class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ uint32_t
+populate(AddOperation<F>& op, const uint32_t a_u32, const uint32_t b_u32) {
+    array_t<uint8_t, 4> lhs = u32_to_le_bytes(a_u32);
+    array_t<uint8_t, 4> rhs = u32_to_le_bytes(b_u32);
+    // Ripple the carry through bytes 0..2; the carry out of the top byte is not recorded.
+    bool carry = false;
+    for (int i = 0; i < 3; ++i) {
+        carry = lhs[i] + rhs[i] + carry > 0xFF;
+        op.carry[i] = F::from_bool(carry).val;
+    }
+    uint32_t sum = a_u32 + b_u32;
+    write_word_from_u32_v2<F>(op.value, sum);
+    return sum;
+}
+
+// Populates an AddSub row from `event`: ADD rows add `b + c`, every other opcode adds `a + c`.
+template<class F>
+__SP1_HOSTDEV__ void event_to_row(const AluEvent& event, AddSubCols<F>& cols) {
+    const bool is_add = event.opcode == Opcode::ADD;
+    auto lhs = is_add ? event.b : event.a;
+    auto rhs = event.c;
+
+    cols.shard = F::from_canonical_u32(event.shard);
+    cols.is_add = F::from_bool(is_add);
+    cols.is_sub = F::from_bool(!is_add);
+    populate<F>(cols.add_operation, lhs, rhs);
+    write_word_from_u32_v2<F>(cols.operand_1, lhs);
+    write_word_from_u32_v2<F>(cols.operand_2, rhs);
+}
+}  // namespace sp1_core_machine_sys::add_sub
\ No newline at end of file
diff --git a/crates/core/machine/include/bb31_septic_extension_t.hpp b/crates/core/machine/include/bb31_septic_extension_t.hpp
new file mode 100644
index 0000000000..9737d8bb12
--- /dev/null
+++ b/crates/core/machine/include/bb31_septic_extension_t.hpp
@@ -0,0 +1,511 @@
+#pragma once
+
+#include "prelude.hpp"
+#include "bb31_t.hpp"
+#include <cstdio>
+
+#ifdef __CUDA_ARCH__
+#define FUN __host__ __device__
+#endif
+#ifndef __CUDA_ARCH__
+#define FUN inline
+#endif
+
+class bb31_cipolla_t {
+    public:
+        bb31_t real;
+        bb31_t imag;
+        // Represents real + imag * w, where w * w is the nonresidue passed to mul_ext()/pow().
+        FUN bb31_cipolla_t(bb31_t real, bb31_t imag) {
+            this->real = real;
+            this->imag = imag;
+        }
+        // The pair (1, 0).
+        FUN static bb31_cipolla_t one() {
+            return bb31_cipolla_t(bb31_t::one(), bb31_t::zero());
+        }
+        // Product of *this and other, replacing w * w by nonresidue.
+        FUN bb31_cipolla_t mul_ext(bb31_cipolla_t other, bb31_t nonresidue) {
+            bb31_t cross = real * other.imag + imag * other.real;
+            bb31_t direct = real * other.real + nonresidue * imag * other.imag;
+            return bb31_cipolla_t(direct, cross);
+        }
+        // Raises *this to exponent under the same w * w = nonresidue rule.
+        FUN bb31_cipolla_t pow(uint32_t exponent, bb31_t nonresidue) {
+            bb31_cipolla_t acc = bb31_cipolla_t::one();
+            bb31_cipolla_t sq = *this;
+            // Binary exponentiation, consuming exponent bits from the least significant end.
+            for (; exponent != 0; exponent >>= 1) {
+                if ((exponent & 1) != 0) {
+                    acc = acc.mul_ext(sq, nonresidue);
+                }
+                // Square for the next bit.
+                sq = sq.mul_ext(sq, nonresidue);
+            }
+
+            return acc;
+        }
+};
+
+namespace constants {  // Lookup tables for the septic extension/curve classes, defined once per compilation pass.
+    #ifdef __CUDA_ARCH__  // device pass: tables are declared __constant__
+        __constant__ constexpr const bb31_t frobenius_const[49] = {  // 7x7 table, read as [7 * i + j] by frobenius()
+            bb31_t(int(1)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)),
+            bb31_t(int(954599710)), bb31_t(int(1359279693)), bb31_t(int(566669999)), bb31_t(int(1982781815)), bb31_t(int(1735718361)), bb31_t(int(1174868538)), bb31_t(int(1120871770)),
+            bb31_t(int(862825265)), bb31_t(int(597046311)), bb31_t(int(978840770)), bb31_t(int(1790138282)), bb31_t(int(1044777201)), bb31_t(int(835869808)), bb31_t(int(1342179023)),
+            bb31_t(int(596273169)), bb31_t(int(658837454)), bb31_t(int(1515468261)), bb31_t(int(367059247)), bb31_t(int(781278880)), bb31_t(int(1544222616)), bb31_t(int(155490465)),
+            bb31_t(int(557608863)), bb31_t(int(1173670028)), bb31_t(int(1749546888)), bb31_t(int(1086464137)), bb31_t(int(803900099)), bb31_t(int(1288818584)), bb31_t(int(1184677604)),
+            bb31_t(int(763416381)), bb31_t(int(1252567168)), bb31_t(int(628856225)), bb31_t(int(1771903394)), bb31_t(int(650712211)), bb31_t(int(19417363)), bb31_t(int(57990258)),
+            bb31_t(int(1734711039)), bb31_t(int(1749813853)), bb31_t(int(1227235221)), bb31_t(int(1707730636)), bb31_t(int(424560395)), bb31_t(int(1007029514)), bb31_t(int(498034669)),
+        };
+        // 7x7 table, read as [7 * i + j] by bb31_septic_extension_t::double_frobenius().
+        __constant__ constexpr const bb31_t double_frobenius_const[49] = {
+            bb31_t(int(1)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)),
+            bb31_t(int(1013489358)), bb31_t(int(1619071628)), bb31_t(int(304593143)), bb31_t(int(1949397349)), bb31_t(int(1564307636)), bb31_t(int(327761151)), bb31_t(int(415430835)),
+            bb31_t(int(209824426)), bb31_t(int(1313900768)), bb31_t(int(38410482)), bb31_t(int(256593180)), bb31_t(int(1708830551)), bb31_t(int(1244995038)), bb31_t(int(1555324019)),
+            bb31_t(int(1475628651)), bb31_t(int(777565847)), bb31_t(int(704492386)), bb31_t(int(1218528120)), bb31_t(int(1245363405)), bb31_t(int(475884575)), bb31_t(int(649166061)),
+            bb31_t(int(550038364)), bb31_t(int(948935655)), bb31_t(int(68722023)), bb31_t(int(1251345762)), bb31_t(int(1692456177)), bb31_t(int(1177958698)), bb31_t(int(350232928)),
+            bb31_t(int(882720258)), bb31_t(int(821925756)), bb31_t(int(199955840)), bb31_t(int(812002876)), bb31_t(int(1484951277)), bb31_t(int(1063138035)), bb31_t(int(491712810)),
+            bb31_t(int(738287111)), bb31_t(int(1955364991)), bb31_t(int(552724293)), bb31_t(int(1175775744)), bb31_t(int(341623997)), bb31_t(int(1454022463)), bb31_t(int(408193320))
+        };
+        // Multiplier (A) and addend (B) applied by bb31_septic_extension_t::universal_hash().
+        __constant__ constexpr const bb31_t A_EC_LOGUP[7] = {bb31_t(int(0x31415926)), bb31_t(int(0x53589793)), bb31_t(int(0x23846264)), bb31_t(int(0x33832795)), bb31_t(int(0x02884197)), bb31_t(int(0x16939937)), bb31_t(int(0x51058209))};
+
+        __constant__ constexpr const bb31_t B_EC_LOGUP[7] = {bb31_t(int(0x74944592)), bb31_t(int(0x30781640)), bb31_t(int(0x62862089)), bb31_t(int(0x9862803)), bb31_t(int(0x48253421)), bb31_t(int(0x17067982)), bb31_t(int(0x14808651))};
+        // Coordinates copied out by bb31_septic_curve_t::dummy_point().
+        __constant__ constexpr const bb31_t dummy_x[7] = {bb31_t(int(0x2738281)), bb31_t(int(0x8284590)), bb31_t(int(0x4523536)), bb31_t(int(0x0287471)), bb31_t(int(0x3526624)), bb31_t(int(0x9775724)), bb31_t(int(0x7093699))};
+        __constant__ constexpr const bb31_t dummy_y[7] = {bb31_t(int(48041908)), bb31_t(int(550064556)), bb31_t(int(415267377)), bb31_t(int(1726976249)), bb31_t(int(1253299140)), bb31_t(int(209439863)), bb31_t(int(1302309485))};
+        // Coordinates copied out by bb31_septic_curve_t::start_point().
+        __constant__ constexpr bb31_t start_x[7] = {bb31_t(int(0x1434213)), bb31_t(int(0x5623730)), bb31_t(int(0x9504880)), bb31_t(int(0x1688724)), bb31_t(int(0x2096980)), bb31_t(int(0x7856967)), bb31_t(int(0x1875376))};
+        __constant__ constexpr bb31_t start_y[7] = {bb31_t(int(885797405)), bb31_t(int(1130275556)), bb31_t(int(567836311)), bb31_t(int(52700240)), bb31_t(int(239639200)), bb31_t(int(442612155)), bb31_t(int(1839439733))};
+
+    #endif  // __CUDA_ARCH__
+
+    #ifndef __CUDA_ARCH__  // non-device pass: the same tables as ordinary static constexpr arrays
+        static constexpr const bb31_t frobenius_const[49] = {  // 7x7 table, read as [7 * i + j] by frobenius()
+            bb31_t(int(1)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)),
+            bb31_t(int(954599710)), bb31_t(int(1359279693)), bb31_t(int(566669999)), bb31_t(int(1982781815)), bb31_t(int(1735718361)), bb31_t(int(1174868538)), bb31_t(int(1120871770)),
+            bb31_t(int(862825265)), bb31_t(int(597046311)), bb31_t(int(978840770)), bb31_t(int(1790138282)), bb31_t(int(1044777201)), bb31_t(int(835869808)), bb31_t(int(1342179023)),
+            bb31_t(int(596273169)), bb31_t(int(658837454)), bb31_t(int(1515468261)), bb31_t(int(367059247)), bb31_t(int(781278880)), bb31_t(int(1544222616)), bb31_t(int(155490465)),
+            bb31_t(int(557608863)), bb31_t(int(1173670028)), bb31_t(int(1749546888)), bb31_t(int(1086464137)), bb31_t(int(803900099)), bb31_t(int(1288818584)), bb31_t(int(1184677604)),
+            bb31_t(int(763416381)), bb31_t(int(1252567168)), bb31_t(int(628856225)), bb31_t(int(1771903394)), bb31_t(int(650712211)), bb31_t(int(19417363)), bb31_t(int(57990258)),
+            bb31_t(int(1734711039)), bb31_t(int(1749813853)), bb31_t(int(1227235221)), bb31_t(int(1707730636)), bb31_t(int(424560395)), bb31_t(int(1007029514)), bb31_t(int(498034669))
+        };
+        // 7x7 table, read as [7 * i + j] by bb31_septic_extension_t::double_frobenius().
+        static constexpr const bb31_t double_frobenius_const[49] = {
+            bb31_t(int(1)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)), bb31_t(int(0)),
+            bb31_t(int(1013489358)), bb31_t(int(1619071628)), bb31_t(int(304593143)), bb31_t(int(1949397349)), bb31_t(int(1564307636)), bb31_t(int(327761151)), bb31_t(int(415430835)),
+            bb31_t(int(209824426)), bb31_t(int(1313900768)), bb31_t(int(38410482)), bb31_t(int(256593180)), bb31_t(int(1708830551)), bb31_t(int(1244995038)), bb31_t(int(1555324019)),
+            bb31_t(int(1475628651)), bb31_t(int(777565847)), bb31_t(int(704492386)), bb31_t(int(1218528120)), bb31_t(int(1245363405)), bb31_t(int(475884575)), bb31_t(int(649166061)),
+            bb31_t(int(550038364)), bb31_t(int(948935655)), bb31_t(int(68722023)), bb31_t(int(1251345762)), bb31_t(int(1692456177)), bb31_t(int(1177958698)), bb31_t(int(350232928)),
+            bb31_t(int(882720258)), bb31_t(int(821925756)), bb31_t(int(199955840)), bb31_t(int(812002876)), bb31_t(int(1484951277)), bb31_t(int(1063138035)), bb31_t(int(491712810)),
+            bb31_t(int(738287111)), bb31_t(int(1955364991)), bb31_t(int(552724293)), bb31_t(int(1175775744)), bb31_t(int(341623997)), bb31_t(int(1454022463)), bb31_t(int(408193320))
+        };
+        // Multiplier (A) and addend (B) applied by bb31_septic_extension_t::universal_hash().
+        static constexpr const bb31_t A_EC_LOGUP[7] = {bb31_t(int(0x31415926)), bb31_t(int(0x53589793)), bb31_t(int(0x23846264)), bb31_t(int(0x33832795)), bb31_t(int(0x02884197)), bb31_t(int(0x16939937)), bb31_t(int(0x51058209))};
+        static constexpr const bb31_t B_EC_LOGUP[7] = {bb31_t(int(0x74944592)), bb31_t(int(0x30781640)), bb31_t(int(0x62862089)), bb31_t(int(0x9862803)), bb31_t(int(0x48253421)), bb31_t(int(0x17067982)), bb31_t(int(0x14808651))};
+        // Coordinates copied out by bb31_septic_curve_t::dummy_point().
+        static constexpr bb31_t dummy_x[7] = {bb31_t(int(0x2738281)), bb31_t(int(0x8284590)), bb31_t(int(0x4523536)), bb31_t(int(0x0287471)), bb31_t(int(0x3526624)), bb31_t(int(0x9775724)), bb31_t(int(0x7093699))};
+        static constexpr bb31_t dummy_y[7] = {bb31_t(int(48041908)), bb31_t(int(550064556)), bb31_t(int(415267377)), bb31_t(int(1726976249)), bb31_t(int(1253299140)), bb31_t(int(209439863)), bb31_t(int(1302309485))};
+        // Coordinates copied out by bb31_septic_curve_t::start_point().
+        static constexpr bb31_t start_x[7] = {bb31_t(int(0x1434213)), bb31_t(int(0x5623730)), bb31_t(int(0x9504880)), bb31_t(int(0x1688724)), bb31_t(int(0x2096980)), bb31_t(int(0x7856967)), bb31_t(int(0x1875376))};
+        static constexpr bb31_t start_y[7] = {bb31_t(int(885797405)), bb31_t(int(1130275556)), bb31_t(int(567836311)), bb31_t(int(52700240)), bb31_t(int(239639200)), bb31_t(int(442612155)), bb31_t(int(1839439733))};
+
+    #endif  // !__CUDA_ARCH__
+}  // namespace constants
+
+class bb31_septic_extension_t {
+    // Element of the degree-7 extension of BabyBear; products are reduced using z^7 = 2z + 5.
+    public:
+        bb31_t value[7];  // value[i] is the coefficient of z^i
+        static constexpr const bb31_t* frobenius_const = constants::frobenius_const;
+        static constexpr const bb31_t* double_frobenius_const = constants::double_frobenius_const;
+        static constexpr const bb31_t* A_EC_LOGUP = constants::A_EC_LOGUP;
+        static constexpr const bb31_t* B_EC_LOGUP = constants::B_EC_LOGUP;
+        // Builds the zero element.
+        FUN bb31_septic_extension_t() {
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                value[k] = bb31_t(0);
+            }
+        }
+        // Embeds a base-field element as the constant coefficient.
+        FUN bb31_septic_extension_t(bb31_t value) {
+            for (uintptr_t k = 1 ; k < 7 ; k++) {
+                this->value[k] = bb31_t(0);
+            }
+            this->value[0] = value;
+        }
+        // Copies seven coefficients from a mutable array.
+        FUN bb31_septic_extension_t(bb31_t value[7]) {
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                this->value[k] = value[k];
+            }
+        }
+        // Copies seven coefficients from a read-only array.
+        FUN bb31_septic_extension_t(const bb31_t value[7]) {
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                this->value[k] = value[k];
+            }
+        }
+        // Additive identity.
+        static FUN bb31_septic_extension_t zero() {
+            return bb31_septic_extension_t();
+        }
+        // Multiplicative identity.
+        static FUN bb31_septic_extension_t one() {
+            return bb31_septic_extension_t(bb31_t::one());
+        }
+        // The base-field constant two, embedded.
+        static FUN bb31_septic_extension_t two() {
+            return bb31_septic_extension_t(bb31_t::two());
+        }
+        // Embeds the base-field element built from the canonical integer n.
+        static FUN bb31_septic_extension_t from_canonical_u32(uint32_t n) {
+            return bb31_septic_extension_t(bb31_t::from_canonical_u32(n));
+        }
+        // Adds a base-field element; only the constant coefficient changes.
+        FUN bb31_septic_extension_t& operator+=(const bb31_t b) {
+            value[0] += b;
+            return *this;
+        }
+        // Value-returning form of the base-field addition.
+        friend FUN bb31_septic_extension_t operator+(bb31_septic_extension_t a, const bb31_t b) {
+            return a += b;
+        }
+        // Coefficient-wise addition.
+        FUN bb31_septic_extension_t& operator+=(const bb31_septic_extension_t b) {
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                value[k] += b.value[k];
+            }
+            return *this;
+        }
+        // Value-returning form of the coefficient-wise addition.
+        friend FUN bb31_septic_extension_t operator+(bb31_septic_extension_t a, const bb31_septic_extension_t b) {
+            return a += b;
+        }
+        // Subtracts a base-field element; only the constant coefficient changes.
+        FUN bb31_septic_extension_t& operator-=(const bb31_t b) {
+            value[0] -= b;
+            return *this;
+        }
+        // Value-returning form of the base-field subtraction.
+        friend FUN bb31_septic_extension_t operator-(bb31_septic_extension_t a, const bb31_t b) {
+            return a -= b;
+        }
+        // Coefficient-wise subtraction.
+        FUN bb31_septic_extension_t& operator-=(const bb31_septic_extension_t b) {
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                value[k] -= b.value[k];
+            }
+            return *this;
+        }
+        // Value-returning form of the coefficient-wise subtraction.
+        friend FUN bb31_septic_extension_t operator-(bb31_septic_extension_t a, const bb31_septic_extension_t b) {
+            return a -= b;
+        }
+        // Scales every coefficient by a base-field element.
+        FUN bb31_septic_extension_t& operator*=(const bb31_t b) {
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                value[k] *= b;
+            }
+            return *this;
+        }
+        // Value-returning form of the scalar multiplication.
+        friend FUN bb31_septic_extension_t operator*(bb31_septic_extension_t a, const bb31_t b) {
+            return a *= b;
+        }
+        // Extension-field multiplication.
+        FUN bb31_septic_extension_t& operator*=(const bb31_septic_extension_t b) {
+            // Schoolbook product (degree <= 12) into prod, then fold z^k for k >= 7 via z^7 = 2z + 5.
+            bb31_t prod[13] = {};
+            for(uintptr_t k = 0 ; k < 13 ; k++) {
+                prod[k] = bb31_t::zero();
+            }
+            for(uintptr_t r = 0 ; r < 7 ; r++) {
+                for(uintptr_t c = 0 ; c < 7 ; c++) {
+                    prod[r + c] += value[r] * b.value[c];
+                }
+            }
+            for(uintptr_t k = 7 ; k < 13 ; k++) {
+                prod[k - 7] += prod[k] * bb31_t::from_canonical_u32(5);
+                prod[k - 6] += prod[k] * bb31_t::from_canonical_u32(2);
+            }
+            for(uintptr_t k = 0 ; k < 7 ; k++) {
+                value[k] = prod[k];
+            }
+            // b was taken by value, so x *= x is safe.
+            return *this;
+        }
+        // Value-returning form of the extension-field multiplication.
+        friend FUN bb31_septic_extension_t operator*(bb31_septic_extension_t a, const bb31_septic_extension_t b) {
+            return a *= b;
+        }
+        // True when all seven coefficients agree.
+        FUN bool operator==(const bb31_septic_extension_t rhs) const {
+            uintptr_t k = 0;
+            while(k < 7 && !(value[k] != rhs.value[k])) {
+                k++;
+            }
+            // The scan only reaches 7 if no coefficient differed.
+            return k == 7;
+        }
+        // Table-driven map: value[0] plus, for i >= 1, value[i] times row i of frobenius_const.
+        FUN bb31_septic_extension_t frobenius() const {
+            bb31_t out[7] = {};
+            out[0] = value[0];
+            for(uintptr_t k = 1 ; k < 7 ; k++) {
+                out[k] = bb31_t::zero();
+            }
+            for(uintptr_t r = 1 ; r < 7 ; r++) {
+                const bb31_t* row = frobenius_const + 7 * r;
+                for(uintptr_t c = 0 ; c < 7 ; c++) {
+                    out[c] += value[r] * row[c];
+                }
+            }
+            return bb31_septic_extension_t(out);
+        }
+        // Same construction as frobenius(), using the double_frobenius_const table.
+        FUN bb31_septic_extension_t double_frobenius() const {
+            bb31_t out[7] = {};
+            out[0] = value[0];
+            for(uintptr_t k = 1 ; k < 7 ; k++) {
+                out[k] = bb31_t::zero();
+            }
+            for(uintptr_t r = 1 ; r < 7 ; r++) {
+                const bb31_t* row = double_frobenius_const + 7 * r;
+                for(uintptr_t c = 0 ; c < 7 ; c++) {
+                    out[c] += value[r] * row[c];
+                }
+            }
+            return bb31_septic_extension_t(out);
+        }
+        // Product of the six images used by pow_r() and reciprocal().
+        FUN bb31_septic_extension_t pow_r_1() const {
+            // With t = frobenius() * double_frobenius(), returns t * D(t) * D(D(t)), D = double_frobenius.
+            const bb31_septic_extension_t t = frobenius() * double_frobenius();
+            const bb31_septic_extension_t t2 = t.double_frobenius();
+            const bb31_septic_extension_t t4 = t2.double_frobenius();
+            return t * t2 * t4;
+        }
+        // Constant coefficient of pow_r_1() * (*this).
+        FUN bb31_t pow_r() const {
+            bb31_septic_extension_t cofactor = pow_r_1();
+            bb31_septic_extension_t full = cofactor * *this;
+            return full.value[0];
+        }
+        // Inverse computed as pow_r_1() scaled by the base-field reciprocal of pow_r().
+        FUN bb31_septic_extension_t reciprocal() const {
+            bb31_septic_extension_t cofactor = pow_r_1();
+            bb31_septic_extension_t full = cofactor * *this;
+            return cofactor * full.value[0].reciprocal();
+        }
+        // Division as multiplication by the reciprocal.
+        friend FUN bb31_septic_extension_t operator/(bb31_septic_extension_t a, bb31_septic_extension_t b) {
+            return a * b.reciprocal();
+        }
+        // In-place division as multiplication by the reciprocal.
+        FUN bb31_septic_extension_t& operator/=(const bb31_septic_extension_t a) {
+            return *this *= a.reciprocal();
+        }
+        // Square-root candidate; pow_r is presumably this->pow_r() - confirm against callers.
+        FUN bb31_septic_extension_t sqrt(bb31_t pow_r) const {
+            const bool input_is_zero = (*this == bb31_septic_extension_t::zero());
+            if (input_is_zero) {
+                return *this;
+            }
+            bb31_septic_extension_t sq = *this;
+            bb31_septic_extension_t acc = *this;
+            for(uintptr_t round = 1 ; round < 30 ; round++) {
+                sq *= sq;
+                if(round >= 26) {
+                    acc *= sq;
+                }
+            }
+            // Here acc = (*this)^(1 + 2^26 + 2^27 + 2^28 + 2^29) = (*this)^1006632961.
+            bb31_septic_extension_t image = acc.frobenius();
+            bb31_septic_extension_t denominator = image;
+            // Fold in two further double-Frobenius images, then the input itself.
+            image = image.double_frobenius();
+            denominator *= image;
+            image = image.double_frobenius();
+            denominator *= image;
+            denominator *= *this;
+            // Base-field part: a runs over 1, 31, 31^2, ... until a^2 - base fails the Euler test.
+            bb31_t base = pow_r.reciprocal();
+            bb31_t g = bb31_t::from_canonical_u32(31);
+            bb31_t a = bb31_t::one();
+            bb31_t nonresidue = bb31_t::one() - base;
+            // 1006632960 equals (bb31_t::MOD - 1) / 2.
+            // A result of one from the power below marks the candidate as a nonzero square.
+            bb31_t euler = nonresidue ^ 1006632960;
+            while (!(euler != bb31_t::one())) {
+                // Still a square: move a to the next power of 31 and retest.
+                a *= g;
+                nonresidue = a.square() - base;
+                euler = nonresidue ^ 1006632960;
+            }
+            // Raise a + w (with w * w = nonresidue) to 1006632961; only the real part is used.
+            bb31_cipolla_t x = bb31_cipolla_t(a, bb31_t::one());
+            x = x.pow(1006632961, nonresidue);
+
+            return denominator * x.real;
+        }
+        // Affine map A_EC_LOGUP * (*this) + B_EC_LOGUP.
+        FUN bb31_septic_extension_t universal_hash() const {
+            return *this * bb31_septic_extension_t(A_EC_LOGUP) + bb31_septic_extension_t(B_EC_LOGUP);
+        }
+        // Evaluates x^3 + 2x + 26 z^5 at x = *this.
+        FUN bb31_septic_extension_t curve_formula() const {
+            bb31_septic_extension_t rhs = (*this) * (*this) * (*this);
+            rhs += *this;
+            rhs += *this;
+            rhs.value[5] += bb31_t::from_canonical_u32(26);
+            return rhs;
+        }
+        // True when the canonical value of the top coefficient lies in [1, (MOD - 1) / 2].
+        FUN bool is_receive() const {
+            uint32_t top = value[6].as_canonical_u32();
+            return top >= 1 && top <= (bb31_t::MOD - 1) / 2;
+        }
+        // True when the canonical value of the top coefficient lies in [(MOD + 1) / 2, MOD - 1].
+        FUN bool is_send() const {
+            uint32_t top = value[6].as_canonical_u32();
+            return top >= (bb31_t::MOD + 1) / 2 && top <= (bb31_t::MOD - 1);
+        }
+        // True when the top coefficient is zero (neither receive nor send).
+        FUN bool is_exception() const {
+            return value[6] == bb31_t::zero();
+        }
+};
+
+
+class bb31_septic_curve_t {
+    public:
+        bb31_septic_extension_t x;
+        bb31_septic_extension_t y;
+        // Coordinate tables for dummy_point() and start_point().
+        static constexpr const bb31_t* dummy_x = constants::dummy_x;
+        static constexpr const bb31_t* dummy_y = constants::dummy_y;
+        static constexpr const bb31_t* start_x = constants::start_x;
+        static constexpr const bb31_t* start_y = constants::start_y;
+        // Default point is (0, 0), which is_infinity() treats as the point at infinity.
+        FUN bb31_septic_curve_t() {
+            x = bb31_septic_extension_t::zero();
+            y = bb31_septic_extension_t::zero();
+        }
+        // Point from explicit coordinates.
+        FUN bb31_septic_curve_t(bb31_septic_extension_t x, bb31_septic_extension_t y)
+            : x(x),
+              y(y) {
+        }
+        // Point from 14 packed coefficients.
+        FUN bb31_septic_curve_t(bb31_t value[14]) {
+            // value[0..6] are the x coefficients,
+            // value[7..13] are the y coefficients.
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                x.value[k] = value[k];
+                y.value[k] = value[k + 7];
+            }
+        }
+        // Point from two separate 7-coefficient arrays.
+        FUN bb31_septic_curve_t(bb31_t value_x[7], bb31_t value_y[7]) {
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                x.value[k] = value_x[k];
+                y.value[k] = value_y[k];
+            }
+        }
+        // Point whose coordinates come from the dummy_x / dummy_y tables.
+        static FUN bb31_septic_curve_t dummy_point() {
+            bb31_septic_curve_t pt;
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                pt.x.value[k] = dummy_x[k];
+                pt.y.value[k] = dummy_y[k];
+            }
+            // Every coefficient of pt has been overwritten from the tables.
+            return pt;
+        }
+        // Point whose coordinates come from the start_x / start_y tables.
+        static FUN bb31_septic_curve_t start_point() {
+            bb31_septic_curve_t pt;
+            for (uintptr_t k = 0 ; k < 7 ; k++) {
+                pt.x.value[k] = start_x[k];
+                pt.y.value[k] = start_y[k];
+            }
+            // Every coefficient of pt has been overwritten from the tables.
+            return pt;
+        }
+        // The pair (0, 0) stands for the point at infinity.
+        FUN bool is_infinity() const {
+            return (x == bb31_septic_extension_t::zero()) && (y == bb31_septic_extension_t::zero());
+        }
+        // Affine point addition, including doubling and the infinity cases.
+        FUN bb31_septic_curve_t& operator+=(const bb31_septic_curve_t b) {
+            // Adding the infinity marker leaves *this unchanged.
+            if (b.is_infinity()) {
+                return *this;
+            }
+            // Infinity plus b is b.
+            if (is_infinity()) {
+                x = b.x;
+                y = b.y;
+                return *this;
+            }
+            const bool same_x = (x == b.x);
+            if (same_x && !(y == b.y)) {
+                // Same x but different y: the sum is the infinity marker.
+                x = bb31_septic_extension_t::zero();
+                y = bb31_septic_extension_t::zero();
+                return *this;
+            }
+            // Tangent slope (3x^2 + 2) / (2y) when doubling, chord slope otherwise.
+            bb31_septic_extension_t slope;
+            if (same_x) {
+                slope = (x * x * bb31_t::from_canonical_u8(3) + bb31_t::two()) / (y * bb31_t::two());
+            }
+            else {
+                slope = (b.y - y) / (b.x - x);
+            }
+            // x3 = slope^2 - x1 - x2 and y3 = slope * (x1 - x3) - y1.
+            bb31_septic_extension_t sum_x = slope * slope - x - b.x;
+            bb31_septic_extension_t sum_y = slope * (x - sum_x) - y;
+            x = sum_x;
+            y = sum_y;
+            return *this;
+        }
+        // Value-returning form of point addition.
+        friend FUN bb31_septic_curve_t operator+(bb31_septic_curve_t a, const bb31_septic_curve_t b) {
+            return a += b;
+        }
+        // Returns (x1 + x2 + x3) * (x2 - x1)^2 - (y2 - y1)^2.
+        static FUN bb31_septic_extension_t sum_checker_x(
+            const bb31_septic_curve_t& p1, const bb31_septic_curve_t& p2, const bb31_septic_curve_t& p3
+        ) {
+            // This vanishes when p3.x matches the chord formula x3 = slope^2 - x1 - x2 from operator+=.
+            const bb31_septic_extension_t dx = p2.x - p1.x;
+            const bb31_septic_extension_t dy = p2.y - p1.y;
+            const bb31_septic_extension_t x_total = p1.x + p2.x + p3.x;
+            return x_total * dx * dx - dy * dy;
+        }
+};
+
+class bb31_septic_digest_t {
+    public:
+        bb31_septic_curve_t point;
+        // Digest holding the default-constructed curve point.
+        FUN bb31_septic_digest_t()
+            : point() {
+        }
+        // Digest from 14 packed coefficients, forwarded to the curve-point constructor.
+        FUN bb31_septic_digest_t(bb31_t value[14])
+            : point(value) {
+        }
+        // Digest from explicit x and y coordinates.
+        FUN bb31_septic_digest_t(bb31_septic_extension_t x, bb31_septic_extension_t y)
+            : point(x, y) {
+        }
+        // Digest wrapping an existing curve point.
+        FUN bb31_septic_digest_t(bb31_septic_curve_t point)
+            : point(point) {
+        }
+};
+
diff --git a/crates/core/machine/include/bb31_t.hpp b/crates/core/machine/include/bb31_t.hpp
new file mode 100644
index 0000000000..c9b387ee30
--- /dev/null
+++ b/crates/core/machine/include/bb31_t.hpp
@@ -0,0 +1,644 @@
+// Modified by Succinct Labs
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+
+#ifdef __CUDA_ARCH__
+
+#define inline __device__ __forceinline__
+#ifdef __GNUC__
+#define asm __asm__ __volatile__
+#else
+#define asm asm volatile
+#endif
+
+class bb31_t {
+ public:
+  using mem_t = bb31_t;
+  uint32_t val;
+  static const uint32_t DEGREE = 1;
+  static const uint32_t NBITS = 31;
+  static const uint32_t MOD = 0x78000001u;
+  static const uint32_t M = 0x77ffffffu;
+  static const uint32_t RR = 0x45dddde3u;
+  static const uint32_t ONE = 0x0ffffffeu;
+  static const uint32_t MONTY_BITS = 32;
+  static const uint32_t MONTY_MU = 0x88000001;
+  static const uint32_t MONTY_MASK = ((1ULL << MONTY_BITS) - 1);
+
+  static constexpr size_t __device__ bit_length() { return 31; }
+
+  inline uint32_t& operator[](size_t i) { return val; }
+
+  inline uint32_t& operator*() { return val; }
+
+  inline const uint32_t& operator[](size_t i) const { return val; }
+
+  inline uint32_t operator*() const { return val; }
+
+  inline size_t len() const { return 1; }
+
+  inline bb31_t() {}
+
+  inline constexpr bb31_t(const uint32_t a) { val = a; }
+
+  inline bb31_t(const uint32_t* p) { val = *p; }
+
+  inline constexpr bb31_t(int a) : val(((uint64_t)a << 32) % MOD) {}
+
+  static inline const bb31_t zero() { return bb31_t(0); }
+
+  static inline const bb31_t one() { return bb31_t(ONE); }
+
+  static inline const bb31_t two() { return from_canonical_u32(2); }
+
+  static inline constexpr uint32_t to_monty(uint32_t x) {  // (x * 2^MONTY_BITS) mod MOD
+    return static_cast<uint32_t>((static_cast<uint64_t>(x) << MONTY_BITS) % MOD);
+  }
+
+  static inline uint32_t monty_reduce(uint64_t x) {
+    // Montgomery reduction: choose q with q * MOD == x (mod 2^32), so that
+    // x - q * MOD is a multiple of 2^32 and its high word is the reduced value.
+    const uint64_t q = (x * static_cast<uint64_t>(MONTY_MU)) & static_cast<uint64_t>(MONTY_MASK);
+    const uint64_t q_mod = q * static_cast<uint64_t>(MOD);
+    const uint32_t hi = static_cast<uint32_t>((x - q_mod) >> MONTY_BITS);
+    // When the 64-bit subtraction wrapped (x < q_mod), add MOD back.
+    return (x < q_mod) ? hi + MOD : hi;
+  }
+
+  static inline uint32_t from_monty(uint32_t x) {
+    return monty_reduce((uint64_t)x);
+  }
+
+  static inline bb31_t from_canonical_u32(uint32_t x) {
+    return bb31_t(to_monty(x));
+  }
+
+  static inline bb31_t from_canonical_u16(uint16_t x) {
+    return from_canonical_u32((uint32_t)x);
+  }
+
+  static inline bb31_t from_canonical_u8(uint8_t x) {
+    return from_canonical_u32((uint32_t)x);
+  }
+
+  static inline bb31_t from_bool(bool x) { return bb31_t(x * one().val); }
+
+  inline uint32_t as_canonical_u32() const {
+    return monty_reduce((uint64_t)val);
+  }
+
+  inline bb31_t exp_power_of_two(size_t log_power) {
+    bb31_t acc = *this;  // ends as (*this)^(2^log_power); *this itself is not modified
+    while (log_power-- > 0) {
+      acc *= acc;
+    }
+    return acc;
+  }
+
+  inline bb31_t& operator+=(const bb31_t b) {
+    // Plain 32-bit add followed by one conditional subtraction of MOD in final_sub().
+    val = val + b.val;
+    final_sub(val);
+    return *this;
+  }
+
+  friend inline bb31_t operator+(bb31_t a, const bb31_t b) { return a += b; }
+
+  inline bb31_t& operator<<=(uint32_t l) {
+    // Doubles l times, applying final_sub() after every doubling.
+    for (; l != 0; l--) {
+      val = val << 1;
+      final_sub(val);
+    }
+    return *this;
+  }
+
+  friend inline bb31_t operator<<(bb31_t a, uint32_t l) { return a <<= l; }
+
+  inline bb31_t& operator>>=(uint32_t r) {
+    // Halves r times; an odd val is first made even by adding MOD.
+    for (; r != 0; r--) {
+      if ((val & 1) != 0) { val += MOD; }
+      val >>= 1;
+    }
+    return *this;
+  }
+
+  friend inline bb31_t operator>>(bb31_t a, uint32_t r) { return a >>= r; }
+
+  inline bb31_t& operator-=(const bb31_t b) {  // val = val - b.val, plus MOD when the subtraction borrows
+    asm("{");
+    asm(".reg.pred %brw;");  // predicate register that records the borrow
+    asm("setp.lt.u32 %brw, %0, %1;" ::"r"(val), "r"(b.val));  // brw = (val < b.val)
+    asm("sub.u32 %0, %0, %1;" : "+r"(val) : "r"(b.val));  // wrapping 32-bit subtract
+    asm("@%brw add.u32 %0, %0, %1;" : "+r"(val) : "r"(MOD));  // add MOD back only if it borrowed
+    asm("}");
+
+    return *this;
+  }
+
+  friend inline bb31_t operator-(bb31_t a, const bb31_t b) { return a -= b; }
+
+  inline bb31_t cneg(bool flag) {  // in place: if flag is set and val != 0, val becomes MOD - val
+    asm("{");
+    asm(".reg.pred %flag;");
+    asm("setp.ne.u32 %flag, %0, 0;" ::"r"(val));  // %flag = (val != 0), so zero is never touched
+    asm("@%flag setp.ne.u32 %flag, %0, 0;" ::"r"((int)flag));  // if val != 0: %flag = (flag != 0)
+    asm("@%flag sub.u32 %0, %1, %0;" : "+r"(val) : "r"(MOD));  // if still set: val = MOD - val
+    asm("}");
+    // Returns a copy of the (possibly negated) element.
+    return *this;
+  }
+
+  static inline bb31_t cneg(bb31_t a, bool flag) { return a.cneg(flag); }
+
+  inline bb31_t operator-() const { return cneg(*this, true); }
+
+  inline bool operator==(const bb31_t rhs) const { return val == rhs.val; }
+
+  inline bool is_one() const { return val == ONE; }
+
+  inline bool is_zero() const { return val == 0; }
+
+  inline void set_to_zero() { val = 0; }
+
+  friend inline bb31_t czero(const bb31_t a, int set_z) {  // returns 0 when set_z != 0, otherwise a
+    bb31_t ret;
+    // Branch-free select through a PTX predicate.
+    asm("{");
+    asm(".reg.pred %set_z;");
+    asm("setp.ne.s32 %set_z, %0, 0;" : : "r"(set_z));  // %set_z = (set_z != 0)
+    asm("selp.u32 %0, 0, %1, %set_z;" : "=r"(ret.val) : "r"(a.val));  // ret.val = %set_z ? 0 : a.val
+    asm("}");
+
+    return ret;
+  }
+
+  static inline bb31_t csel(const bb31_t a, const bb31_t b, int sel_a) {  // returns a when sel_a != 0, else b
+    bb31_t ret;
+    // Branch-free select through a PTX predicate.
+    asm("{");
+    asm(".reg.pred %sel_a;");
+    asm("setp.ne.s32 %sel_a, %0, 0;" ::"r"(sel_a));  // %sel_a = (sel_a != 0)
+    asm("selp.u32 %0, %1, %2, %sel_a;"
+        : "=r"(ret.val)
+        : "r"(a.val), "r"(b.val));  // ret.val = %sel_a ? a.val : b.val
+    asm("}");
+
+    return ret;
+  }
+
+ private:
+  static inline void final_sub(uint32_t& val) {  // one conditional subtraction: if val >= MOD, val -= MOD
+    asm("{");
+    asm(".reg.pred %p;");
+    asm("setp.ge.u32 %p, %0, %1;" ::"r"(val), "r"(MOD));  // %p = (val >= MOD)
+    asm("@%p sub.u32 %0, %0, %1;" : "+r"(val) : "r"(MOD));  // executed only when %p is set
+    asm("}");
+  }
+
+  inline bb31_t& mul(const bb31_t b) {  // Montgomery product of val and b.val, stored back into val
+    uint32_t tmp[2], red;
+    // tmp[1]:tmp[0] = full 64-bit product val * b.val.
+    asm("mul.lo.u32 %0, %2, %3; mul.hi.u32 %1, %2, %3;"
+        : "=r"(tmp[0]), "=r"(tmp[1])
+        : "r"(val), "r"(b.val));
+    asm("mul.lo.u32 %0, %1, %2;" : "=r"(red) : "r"(tmp[0]), "r"(M));  // red = tmp[0] * M mod 2^32 (M * MOD == -1 mod 2^32)
+    asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %4;"
+        : "+r"(tmp[0]), "=r"(val)
+        : "r"(red), "r"(MOD), "r"(tmp[1]));
+    // The low word of product + red * MOD cancels; val now holds the high word including the carry.
+    final_sub(val);
+
+    return *this;
+  }
+
+  inline uint32_t mul_by_1() const {  // high word of val + red * MOD; no final_sub is applied here
+    uint32_t tmp[2], red;
+    // red = val * M mod 2^32, chosen so that the low word of val + red * MOD is zero.
+    asm("mul.lo.u32 %0, %1, %2;" : "=r"(red) : "r"(val), "r"(M));
+    asm("mad.lo.cc.u32 %0, %2, %3, %4; madc.hi.u32 %1, %2, %3, 0;"
+        : "=r"(tmp[0]), "=r"(tmp[1])
+        : "r"(red), "r"(MOD), "r"(val));
+    return tmp[1];
+  }
+
+ public:
+  friend inline bb31_t operator*(bb31_t a, const bb31_t b) { return a.mul(b); }
+
+  inline bb31_t& operator*=(const bb31_t a) { return mul(a); }
+
+  // raise to a variable power, variable in respect to threadIdx,
+  // but mind the ^ operator's precedence!
+  inline bb31_t& operator^=(uint32_t p) {
+    bb31_t power = *this;  // holds the base squared once per exponent bit consumed
+    *this = csel(val, ONE, p & 1);  // bit 0: start from the base if set, otherwise from ONE
+    // Square-and-multiply over the remaining bits, least significant first.
+#pragma unroll 1
+    for (p >>= 1; p != 0; p >>= 1) {
+      power.mul(power);
+      if ((p & 1) != 0) {
+        mul(power);
+      }
+    }
+    return *this;
+  }
+
+  friend inline bb31_t operator^(bb31_t a, uint32_t p) {
+    return a ^= p;
+  }
+
+  inline bb31_t operator()(uint32_t p) {
+    return *this ^ p;
+  }
+
+  // raise to a constant power, e.g. x^7, to be unrolled at compile time
+  inline bb31_t& operator^=(int p) {
+    if (p < 2) {
+      asm("trap;");  // exponents below 2 are rejected
+    }
+    bb31_t power = *this;
+    if ((p & 1) == 0) {
+      // Strip trailing zero bits first: each one is a plain squaring of the base.
+      do {
+        power.mul(power);
+        p >>= 1;
+      } while ((p & 1) == 0);
+      *this = power;
+    }
+    // The lowest remaining bit is set; fold in the higher bits by square-and-multiply.
+    for (p >>= 1; p != 0; p >>= 1) {
+      power.mul(power);
+      if ((p & 1) != 0) { mul(power); }
+    }
+    return *this;
+  }
+
+  friend inline bb31_t operator^(bb31_t a, int p) {
+    return a ^= p;
+  }
+
+  inline bb31_t operator()(int p) {
+    return *this ^ p;
+  }
+
+  inline bb31_t square() { return *this * *this; }
+
+  friend inline bb31_t sqr(bb31_t a) {
+    return a.sqr();
+  }
+
+  inline bb31_t& sqr() {
+    return mul(*this);
+  }
+
+  inline void to() {
+    mul(RR);
+  }
+
+  inline void from() {
+    val = mul_by_1();
+  }
+
+  template <size_t T>
+  static inline bb31_t dot_product(const bb31_t a[T], const bb31_t b[T]) {
+    uint32_t acc[2];
+    size_t i = 1;
+    // acc[1]:acc[0] is a 64-bit accumulator, seeded with the full product a[0] * b[0].
+    asm("mul.lo.u32 %0, %2, %3; mul.hi.u32 %1, %2, %3;"
+        : "=r"(acc[0]), "=r"(acc[1])
+        : "r"(*a[0]), "r"(*b[0]));
+    if ((T & 1) == 0) {  // even T: consume one extra term so the loop below can take terms in pairs
+      asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+          : "+r"(acc[0]), "+r"(acc[1])
+          : "r"(*a[i]), "r"(*b[i]));
+      i++;
+    }
+    for (; i < T; i += 2) {  // two multiply-accumulates per iteration
+      asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+          : "+r"(acc[0]), "+r"(acc[1])
+          : "r"(*a[i]), "r"(*b[i]));
+      asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+          : "+r"(acc[0]), "+r"(acc[1])
+          : "r"(*a[i + 1]), "r"(*b[i + 1]));
+      final_sub(acc[1]);  // conditionally subtract MOD from the high word after each pair
+    }
+    // Final reduction, same pattern as mul(): red = acc[0] * M, then add red * MOD.
+    uint32_t red;
+    asm("mul.lo.u32 %0, %1, %2;" : "=r"(red) : "r"(acc[0]), "r"(M));
+    asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+        : "+r"(acc[0]), "+r"(acc[1])
+        : "r"(red), "r"(MOD));
+    final_sub(acc[1]);
+    // The high word is the result.
+    return acc[1];
+  }
+
+  template <size_t T>  // T = total products: a0 * b0 plus a[0..T-2] against b read at stride_b steps
+  static inline bb31_t dot_product(bb31_t a0, bb31_t b0, const bb31_t a[T - 1],
+                                   const bb31_t* b, size_t stride_b = 1) {
+    uint32_t acc[2];  // 64-bit accumulator: acc[0] = low word, acc[1] = high word
+    size_t i = 0;
+    // Seed the accumulator with the full 64-bit product a0 * b0.
+    asm("mul.lo.u32 %0, %2, %3; mul.hi.u32 %1, %2, %3;"
+        : "=r"(acc[0]), "=r"(acc[1])
+        : "r"(*a0), "r"(*b0));
+    if ((T & 1) == 0) {  // even T: take one array pair so an even number remain
+      asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+          : "+r"(acc[0]), "+r"(acc[1])
+          : "r"(*a[i]), "r"(*b[0]));
+      i++, b += stride_b;
+    }
+    for (; i < T - 1; i += 2) {  // two multiply-accumulates per final_sub; b advances by stride_b each
+      asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+          : "+r"(acc[0]), "+r"(acc[1])
+          : "r"(*a[i]), "r"(*b[0]));
+      b += stride_b;
+      asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+          : "+r"(acc[0]), "+r"(acc[1])
+          : "r"(*a[i + 1]), "r"(*b[0]));
+      b += stride_b;
+      final_sub(acc[1]);  // NOTE(review): presumably a conditional subtraction of MOD — confirm
+    }
+    // Reduction: red = low 32 bits of acc[0] * M, then acc += red * MOD; the high word is the result.
+    uint32_t red;
+    asm("mul.lo.u32 %0, %1, %2;" : "=r"(red) : "r"(acc[0]), "r"(M));
+    asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
+        : "+r"(acc[0]), "+r"(acc[1])
+        : "r"(red), "r"(MOD));
+    final_sub(acc[1]);
+
+    return acc[1];
+  }
+
+ private:
+  static inline bb31_t sqr_n(bb31_t s, uint32_t n) {  // s squared n times (disabled branch = reference form)
+#if 0
+#pragma unroll 2
+        while (n--)
+            s.sqr();
+#else  // +20% [for reciprocal()]
+#pragma unroll 2
+    while (n--) {
+      uint32_t tmp[2], red;
+      // tmp = 64-bit s.val^2; red = low 32 bits of tmp[0] * M; s.val = high word of tmp + red * MOD.
+      asm("mul.lo.u32 %0, %2, %2; mul.hi.u32 %1, %2, %2;"
+          : "=r"(tmp[0]), "=r"(tmp[1])
+          : "r"(s.val));
+      asm("mul.lo.u32 %0, %1, %2;" : "=r"(red) : "r"(tmp[0]), "r"(M));
+      asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %4;"
+          : "+r"(tmp[0]), "=r"(s.val)
+          : "r"(red), "r"(MOD), "r"(tmp[1]));
+      // final_sub runs only while the remaining count n is odd, i.e. on alternate iterations.
+      if (n & 1)
+        final_sub(s.val);
+    }
+#endif
+    return s;  // NOTE(review): the last iteration has n == 0 and skips final_sub — confirm callers accept that
+  }
+
+  // n squarings of `s` via sqr_n(), followed by one multiplication by `m`.
+  static inline bb31_t sqr_n_mul(bb31_t s, uint32_t n, bb31_t m) {
+    bb31_t acc = sqr_n(s, n);
+    acc.mul(m);
+    return acc;
+  }
+
+ public:
+  inline bb31_t reciprocal() const {  // returns (*this)^0x77ffffff
+    bb31_t x11, xff, ret = *this;
+    // Addition chain: each trailing comment is the exponent reached, in binary.
+    x11 = sqr_n_mul(ret, 4, ret);  // 0b10001
+    ret = sqr_n_mul(x11, 1, x11);  // 0b110011
+    ret = sqr_n_mul(ret, 1, x11);  // 0b1110111
+    xff = sqr_n_mul(ret, 1, x11);  // 0b11111111
+    ret = sqr_n_mul(ret, 8, xff);  // 0b111011111111111
+    ret = sqr_n_mul(ret, 8, xff);  // 0b11101111111111111111111
+    ret = sqr_n_mul(ret, 8, xff);  // 0b1110111111111111111111111111111
+    // NOTE(review): 0x77ffffff equals MOD - 2 if MOD is 0x78000001 here (Fermat inverse) — confirm.
+    return ret;
+  }
+
+  friend inline bb31_t operator/(int one, bb31_t a) {  // supports the spelling `1 / a`
+    if (one != 1)
+      asm("trap;");  // any numerator other than 1 is rejected at run time
+    return a.reciprocal();
+  }
+
+  friend inline bb31_t operator/(bb31_t a, bb31_t b) {
+    return a * b.reciprocal();  // division = multiplication by b.reciprocal()
+  }
+
+  inline bb31_t& operator/=(const bb31_t a) {
+    return *this *= a.reciprocal();  // in-place division via a.reciprocal()
+  }
+
+  inline bb31_t heptaroot() const {  // returns (*this)^0x66db6db7
+    bb31_t x03, x18, x1b, ret = *this;
+    // Addition chain: each trailing comment is the exponent reached, in binary.
+    x03 = sqr_n_mul(ret, 1, ret);    // 0b11
+    x18 = sqr_n(x03, 3);             // 0b11000
+    x1b = x18 * x03;                 // 0b11011
+    ret = x18 * x1b;                 // 0b110011
+    ret = sqr_n_mul(ret, 6, x1b);    // 0b110011011011
+    ret = sqr_n_mul(ret, 6, x1b);    // 0b110011011011011011
+    ret = sqr_n_mul(ret, 6, x1b);    // 0b110011011011011011011011
+    ret = sqr_n_mul(ret, 6, x1b);    // 0b110011011011011011011011011011
+    ret = sqr_n_mul(ret, 1, *this);  // 0b1100110110110110110110110110111
+    // 7 * 0x66db6db7 == 6 * 0x78000000 + 1, so this undoes x -> x^7 if MOD is 0x78000001 — confirm MOD.
+    return ret;
+  }
+
+  inline void shfl_bfly(uint32_t laneMask) {  // overwrite val with a warp XOR-shuffle of val
+    val = __shfl_xor_sync(0xFFFFFFFF, val, laneMask);  // 0xFFFFFFFF: full participation mask
+  }
+};
+
+#undef inline
+#undef asm
+// # endif // __CUDA__ARCH__
+
+#else
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+class bb31_t {  // host-side element of the field mod 0x78000001, kept in Montgomery form
+ private:
+  static const uint32_t M = 0x77ffffffu;    // -MOD^-1 mod 2^32
+  static const uint32_t RR = 0x45dddde3u;   // 2^64 mod MOD
+  static const uint32_t ONE = 0x0ffffffeu;  // 2^32 mod MOD, i.e. Montgomery 1
+  static const uint32_t MONTY_BITS = 32;    // the Montgomery radix is 2^32
+  static const uint32_t MONTY_MU = 0x88000001;  // MOD^-1 mod 2^32
+  static const uint32_t MONTY_MASK = ((1ULL << MONTY_BITS) - 1);
+
+ public:
+  using mem_t = bb31_t;
+  uint32_t val;  // Montgomery representation; the arithmetic below expects val < MOD
+  static const uint32_t DEGREE = 1;
+  static const uint32_t NBITS = 31;
+  static const uint32_t MOD = 0x78000001;
+  // Default construction leaves `val` uninitialized.
+  inline constexpr bb31_t() {}
+  // Adopts `a` verbatim: the caller supplies a value already in Montgomery form.
+  inline constexpr bb31_t(uint32_t a) : val(a) {}
+  // Converts to Montgomery form. NOTE(review): a negative `a` is sign-extended first and does not map to a mod MOD — confirm callers pass a >= 0.
+  inline constexpr bb31_t(int a) : val(((uint64_t)a << 32) % MOD) {}
+  // Additive identity.
+  static inline const bb31_t zero() { return bb31_t(0); }
+  // Multiplicative identity.
+  static inline const bb31_t one() { return bb31_t(ONE); }
+  // The field element 2.
+  static inline const bb31_t two() { return bb31_t(to_monty(2)); }
+  // Canonical -> Montgomery: (x * 2^32) mod MOD.
+  static inline constexpr uint32_t to_monty(uint32_t x) {
+    return (((uint64_t)x << MONTY_BITS) % MOD);
+  }
+  // Montgomery -> canonical: a single reduction divides out the 2^32 factor.
+  static inline uint32_t from_monty(uint32_t x) {
+    return monty_reduce((uint64_t)x);
+  }
+  // Montgomery reduction: x * 2^-32 mod MOD, in [0, MOD) for x < MOD * 2^32.
+  static inline uint32_t monty_reduce(uint64_t x) {
+    uint64_t t = (x * (uint64_t)MONTY_MU) & (uint64_t)MONTY_MASK;
+    uint64_t u = t * (uint64_t)MOD;  // u == x (mod 2^32)
+    uint64_t x_sub_u = x - u;        // so the low 32 bits here are zero
+    bool over = x < u;               // true when the subtraction wrapped
+    uint32_t x_sub_u_hi = (uint32_t)(x_sub_u >> MONTY_BITS);
+    uint32_t corr = over ? MOD : 0;
+    return x_sub_u_hi + corr;
+  }
+  // Builds an element from a canonical value; asserts x < MOD.
+  static inline bb31_t from_canonical_u32(uint32_t x) {
+    assert(x < MOD);
+    return bb31_t(to_monty(x));
+  }
+  // Widening wrapper over from_canonical_u32 (any uint16_t is < MOD).
+  static inline bb31_t from_canonical_u16(uint16_t x) {
+    return from_canonical_u32((uint32_t)x);
+  }
+  // Widening wrapper over from_canonical_u32 (any uint8_t is < MOD).
+  static inline bb31_t from_canonical_u8(uint8_t x) {
+    return from_canonical_u32((uint32_t)x);
+  }
+  // true -> one(), false -> zero().
+  static inline bb31_t from_bool(bool x) { return bb31_t(x * one().val); }
+  // Leaves Montgomery form: returns from_monty(val).
+  inline uint32_t as_canonical_u32() const { return from_monty(val); }
+  // Modular addition; with both vals < MOD the 32-bit sum cannot overflow.
+  inline bb31_t& operator+=(bb31_t b) {
+    val += b.val;
+    if (val >= MOD)
+      val -= MOD;
+    return *this;
+  }
+  // Modular subtraction; MOD is added first whenever the difference would go negative.
+  inline bb31_t& operator-=(bb31_t b) {
+    if (val < b.val)
+      val += MOD;
+    val -= b.val;
+    return *this;
+  }
+  // Montgomery multiplication: reduces the 64-bit product of the two vals.
+  inline bb31_t& operator*=(bb31_t b) {
+    uint64_t long_prod = (uint64_t)val * (uint64_t)b.val;
+    val = monty_reduce(long_prod);
+    return *this;
+  }
+  // Non-mutating square.
+  inline bb31_t square() { return *this * *this; }
+  // Binary operators: each works on the by-value copy `a` and returns it.
+  friend bb31_t operator+(bb31_t a, bb31_t b) { return a += b; }
+
+  friend bb31_t operator-(bb31_t a, bb31_t b) { return a -= b; }
+
+  friend bb31_t operator*(bb31_t a, bb31_t b) { return a *= b; }
+  // Multiplies by 2^l through l modular doublings.
+  inline bb31_t& operator<<=(uint32_t l) {
+    while (l--) {
+      val <<= 1;
+      if (val >= MOD)
+        val -= MOD;
+    }
+
+    return *this;
+  }
+  // Non-mutating form of operator<<=.
+  friend inline bb31_t operator<<(bb31_t a, uint32_t l) { return a <<= l; }
+  // Divides by 2^r: an odd val is made even by adding MOD, then halved.
+  inline bb31_t& operator>>=(uint32_t r) {
+    while (r--) {
+      val += val & 1 ? MOD : 0;
+      val >>= 1;
+    }
+
+    return *this;
+  }
+  // Returns (*this)^(2^power_log) via power_log successive squarings.
+  inline bb31_t exp_power_of_2(uint32_t power_log) const {
+    bb31_t result = *this;
+    for (uint32_t i = 0; i < power_log; ++i) {
+      result = result.square();
+    }
+    return result;
+  }
+  // Multiplicative inverse as (*this)^(MOD - 2); asserts that *this is nonzero.
+  inline bb31_t reciprocal() const {
+    assert(!is_zero());  // is_zero() instead of !=: the class defines no operator!=
+    // Addition chain: every variable name spells its exponent in binary.
+    bb31_t p1 = *this;
+    bb31_t p100000000 = p1.exp_power_of_2(8);
+    bb31_t p100000001 = p100000000 * p1;
+    bb31_t p10000000000000000 = p100000000.exp_power_of_2(8);
+    bb31_t p10000000100000001 = p10000000000000000 * p100000001;
+    bb31_t p10000000100000001000 = p10000000100000001.exp_power_of_2(3);
+    bb31_t p1000000010000000100000000 = p10000000100000001000.exp_power_of_2(5);
+    bb31_t p1000000010000000100000001 = p1000000010000000100000000 * p1;
+    bb31_t p1000010010000100100001001 =
+        p1000000010000000100000001 * p10000000100000001000;
+    bb31_t p10000000100000001000000010 = p1000000010000000100000001.square();
+    bb31_t p11000010110000101100001011 =
+        p10000000100000001000000010 * p1000010010000100100001001;
+    bb31_t p100000001000000010000000100 = p10000000100000001000000010.square();
+    bb31_t p111000011110000111100001111 =
+        p100000001000000010000000100 * p11000010110000101100001011;
+    bb31_t p1110000111100001111000011110000 =
+        p111000011110000111100001111.exp_power_of_2(4);
+    bb31_t p1110111111111111111111111111111 =
+        p1110000111100001111000011110000 * p111000011110000111100001111;
+
+    return p1110111111111111111111111111111;
+  }
+  // Equality of the stored Montgomery values.
+  inline bool operator==(const bb31_t rhs) const { return val == rhs.val; }
+  // In-place power by square-and-multiply; the bits of `b` are read as an unsigned 32-bit exponent.
+  inline bb31_t& operator^=(int b) {
+    bb31_t sqr = *this;  // running square: (*this)^(2^k)
+    if ((b & 1) == 0)
+      *this = one();
+    for (uint32_t e = (uint32_t)b >> 1; e != 0; e >>= 1) {  // unsigned shift: a negative b must still terminate
+      sqr = sqr.square();
+      if (e & 1)
+        *this *= sqr;
+    }
+    return *this;
+  }
+  // Non-mutating power; exponents >= 2^31 pass through `int` and are recovered by the cast in operator^=.
+  friend bb31_t operator^(bb31_t a, uint32_t b) { return a ^= b; }
+  // In-place square (operator*= takes its argument by value, so self-multiplication is safe).
+  inline bb31_t& sqr() { return *this *= *this; }
+  // Resets the element to zero.
+  inline void set_to_zero() { val = 0; }
+  // True iff the element is zero.
+  inline bool is_zero() const { return val == 0; }
+};
+
+#endif  // __CUDA__ARCH__
\ No newline at end of file
diff --git a/crates/core/machine/include/bitwise.hpp b/crates/core/machine/include/bitwise.hpp
new file mode 100644
index 0000000000..190e5f287c
--- /dev/null
+++ b/crates/core/machine/include/bitwise.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "prelude.hpp"
+#include "utils.hpp"
+
+namespace sp1_core_machine_sys::bitwise {  // fills BitwiseCols rows from AluEvents
+template<class F>  // F: field type supplying from_canonical_u32 / from_bool
+__SP1_HOSTDEV__ void event_to_row(const AluEvent& event, BitwiseCols<F>& cols) {
+    cols.shard = F::from_canonical_u32(event.shard);
+    write_word_from_u32<F>(cols.a, event.a);
+    write_word_from_u32<F>(cols.b, event.b);
+    write_word_from_u32<F>(cols.c, event.c);
+    cols.is_xor = F::from_bool(event.opcode == Opcode::XOR);  // selector flags, one per handled opcode
+    cols.is_or = F::from_bool(event.opcode == Opcode::OR);
+    cols.is_and = F::from_bool(event.opcode == Opcode::AND);
+
+    // No byte lookup yet.
+}
+}  // namespace sp1_core_machine_sys::bitwise
diff --git a/crates/core/machine/include/cpu.hpp b/crates/core/machine/include/cpu.hpp
new file mode 100644
index 0000000000..41f78f9ef6
--- /dev/null
+++ b/crates/core/machine/include/cpu.hpp
@@ -0,0 +1,555 @@
+#pragma once
+
+#include <cassert>
+#include <cstdlib>
+
+#include "memory.hpp"
+#include "prelude.hpp"
+#include "utils.hpp"
+
+// namespace sp1_core_machine_sys::cpu {
+
+// template<class F>
+// __SP1_HOSTDEV__ void populate_shard_clk(const CpuEventFfi& event, CpuCols<decltype(F::val)>& cols) {
+//     // cols.shard = F::from_canonical_u32(event.shard).val;
+//     // cols.clk = F::from_canonical_u32(event.clk).val;
+
+//     // const uint16_t clk_16bit_limb = (uint16_t)event.clk;
+//     // const uint8_t clk_8bit_limb = (uint8_t)(event.clk >> 16);
+//     // cols.clk_16bit_limb = F::from_canonical_u16(clk_16bit_limb).val;
+//     // cols.clk_8bit_limb = F::from_canonical_u8(clk_8bit_limb).val;
+
+//     // blu_events.add_byte_lookup_event(ByteLookupEvent::new(
+//     //     event.shard,
+//     //     U16Range,
+//     //     event.shard as u16,
+//     //     0,
+//     //     0,
+//     //     0,
+//     // ));
+//     // blu_events.add_byte_lookup_event(ByteLookupEvent::new(
+//     //     event.shard,
+//     //     U16Range,
+//     //     clk_16bit_limb,
+//     //     0,
+//     //     0,
+//     //     0,
+//     // ));
+//     // blu_events.add_byte_lookup_event(ByteLookupEvent::new(
+//     //     event.shard,
+//     //     ByteOpcode::U8Range,
+//     //     0,
+//     //     0,
+//     //     0,
+//     //     clk_8bit_limb as u8,
+//     // ));
+// }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ void
+// // instruction_populate(InstructionCols<decltype(F::val)>& self, const Instruction& instruction) {
+// //     self.opcode = F::from_canonical_u32((uint32_t)instruction.opcode).val;
+// //     write_word_from_u32<F>(self.op_a, instruction.op_a);
+// //     write_word_from_u32<F>(self.op_b, instruction.op_b);
+// //     write_word_from_u32<F>(self.op_c, instruction.op_c);
+
+// //     self.op_a_0 = F::from_bool(instruction.op_a == 0).val;  // 0 = Register::X0
+// // }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ void
+// // selectors_populate(OpcodeSelectorCols<decltype(F::val)>& self, const Instruction& instruction) {
+// //     self.imm_b = F::from_bool(instruction.imm_b).val;
+// //     self.imm_c = F::from_bool(instruction.imm_c).val;
+
+// //     switch (instruction.opcode) {
+// //         // Corresponds to `instruction.is_alu_instruction()` in Rust.
+// //         case Opcode::ADD:
+// //         case Opcode::SUB:
+// //         case Opcode::XOR:
+// //         case Opcode::OR:
+// //         case Opcode::AND:
+// //         case Opcode::SLL:
+// //         case Opcode::SRL:
+// //         case Opcode::SRA:
+// //         case Opcode::SLT:
+// //         case Opcode::SLTU:
+// //         case Opcode::MUL:
+// //         case Opcode::MULH:
+// //         case Opcode::MULHU:
+// //         case Opcode::MULHSU:
+// //         case Opcode::DIV:
+// //         case Opcode::DIVU:
+// //         case Opcode::REM:
+// //         case Opcode::REMU:
+// //             self.is_alu = F::one().val;
+// //             break;
+// //         // Corresponds to `instruction.is_ecall_instruction()` in Rust.
+// //         case Opcode::ECALL:
+// //             self.is_ecall = F::one().val;
+// //             break;
+// //         // Cleaner version of the `instruction.is_memory_instruction()` branch from Rust.
+// //         case Opcode::LB:
+// //             self.is_lb = F::one().val;
+// //             break;
+// //         case Opcode::LBU:
+// //             self.is_lbu = F::one().val;
+// //             break;
+// //         case Opcode::LHU:
+// //             self.is_lhu = F::one().val;
+// //             break;
+// //         case Opcode::LH:
+// //             self.is_lh = F::one().val;
+// //             break;
+// //         case Opcode::LW:
+// //             self.is_lw = F::one().val;
+// //             break;
+// //         case Opcode::SB:
+// //             self.is_sb = F::one().val;
+// //             break;
+// //         case Opcode::SH:
+// //             self.is_sh = F::one().val;
+// //             break;
+// //         case Opcode::SW:
+// //             self.is_sw = F::one().val;
+// //             break;
+// //         // Cleaner version of the `instruction.is_branch_instruction()` branch from Rust.
+// //         case Opcode::BEQ:
+// //             self.is_beq = F::one().val;
+// //             break;
+// //         case Opcode::BNE:
+// //             self.is_bne = F::one().val;
+// //             break;
+// //         case Opcode::BLT:
+// //             self.is_blt = F::one().val;
+// //             break;
+// //         case Opcode::BGE:
+// //             self.is_bge = F::one().val;
+// //             break;
+// //         case Opcode::BLTU:
+// //             self.is_bltu = F::one().val;
+// //             break;
+// //         case Opcode::BGEU:
+// //             self.is_bgeu = F::one().val;
+// //             break;
+// //         // Opcodes which each have their own branch in the original Rust function.
+// //         case Opcode::JAL:
+// //             self.is_jal = F::one().val;
+// //             break;
+// //         case Opcode::JALR:
+// //             self.is_jalr = F::one().val;
+// //             break;
+// //         case Opcode::AUIPC:
+// //             self.is_auipc = F::one().val;
+// //             break;
+// //         case Opcode::UNIMP:
+// //             self.is_unimpl = F::one().val;
+// //             break;
+// //         default:
+// //             break;
+// //     }
+// // }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ void
+// // babybear_word_populate(BabyBearWordRangeChecker<decltype(F::val)>& self, uint32_t value) {
+// //     for (uintptr_t i = 0; i < BYTE_SIZE; ++i) {
+// //         self.most_sig_byte_decomp[i] = F::from_bool((value & (1 << (i + 24))) != 0).val;
+// //     }
+// //     self.and_most_sig_byte_decomp_3_to_5 =
+// //         F::from_bool(self.most_sig_byte_decomp[3] != 0 && self.most_sig_byte_decomp[4] != 0).val;
+// //     self.and_most_sig_byte_decomp_3_to_6 =
+// //         F::from_bool(self.and_most_sig_byte_decomp_3_to_5 != 0 && self.most_sig_byte_decomp[5] != 0)
+// //             .val;
+// //     self.and_most_sig_byte_decomp_3_to_7 =
+// //         F::from_bool(self.and_most_sig_byte_decomp_3_to_6 != 0 && self.most_sig_byte_decomp[6] != 0)
+// //             .val;
+// // }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ void populate_memory(CpuCols<decltype(F::val)>& cols, const CpuEventFfi& event) {
+// //     // Populate addr_word and addr_aligned columns.
+// //     MemoryColumns<decltype(F::val)>& memory_columns = cols.opcode_specific_columns.memory;
+// //     // Wraps because the types involved are unsigned integers.
+// //     const uint32_t memory_addr = event.b + event.c;
+// //     const uint32_t aligned_addr = memory_addr - (memory_addr % (uint32_t)WORD_SIZE);
+// //     write_word_from_u32<F>(memory_columns.addr_word, memory_addr);
+// //     babybear_word_populate<F>(memory_columns.addr_word_range_checker, memory_addr);
+// //     memory_columns.addr_aligned = F::from_canonical_u32(aligned_addr).val;
+
+// //     // Populate the aa_least_sig_byte_decomp columns.
+// //     // assert(aligned_addr % 4 == 0);
+// //     const uint8_t aligned_addr_ls_byte = (uint8_t)aligned_addr;
+// //     for (uintptr_t i = 0; i < 6; ++i) {
+// //         memory_columns.aa_least_sig_byte_decomp[i] =
+// //             F::from_bool((aligned_addr_ls_byte & (1 << (i + 2))) != 0).val;
+// //     }
+// //     memory_columns.addr_word_nonce = F::from_canonical_u32(event.memory_add_nonce).val;
+
+// //     // // Populate memory offsets.
+// //     const uint8_t addr_offset = (uint8_t)(memory_addr % (uint32_t)WORD_SIZE);
+// //     memory_columns.addr_offset = F::from_canonical_u8(addr_offset).val;
+// //     memory_columns.offset_is_one = F::from_bool(addr_offset == 1).val;
+// //     memory_columns.offset_is_two = F::from_bool(addr_offset == 2).val;
+// //     memory_columns.offset_is_three = F::from_bool(addr_offset == 3).val;
+
+// //     // If it is a load instruction, set the unsigned_mem_val column.
+// //     const uint32_t mem_value = memory::unwrap_value(event.memory_record);
+
+// //     // // Add event to byte lookup for byte range checking each byte in the memory addr
+// //     // let addr_bytes = memory_addr.to_le_bytes();
+// //     // for byte_pair in addr_bytes.chunks_exact(2) {
+// //     //     blu_events.add_byte_lookup_event(ByteLookupEvent {
+// //     //         shard: event.shard,
+// //     //         opcode: ByteOpcode::U8Range,
+// //     //         a1: 0,
+// //     //         a2: 0,
+// //     //         b: byte_pair[0],
+// //     //         c: byte_pair[1],
+// //     //     });
+// //     // }
+
+// //     uint32_t unsigned_mem_val = mem_value;
+// //     switch (event.instruction.opcode) {
+// //         case Opcode::LB:
+// //         case Opcode::LBU:
+// //             unsigned_mem_val = (uint32_t)(uint8_t)(mem_value >> 8 * addr_offset);
+// //             break;
+// //         case Opcode::LH:
+// //         case Opcode::LHU:
+// //             unsigned_mem_val = ((addr_offset >> 1) & 0x1) == 0 ? (mem_value & 0x0000FFFF)
+// //                                                                : (mem_value & 0xFFFF0000) >> 16;
+// //             break;
+// //         case Opcode::LW:
+// //             // The value assigned at declaration is correct.
+// //             break;
+// //         default:
+// //             return;
+// //     }
+// //     // Guard above ensures instruction is a load.
+// //     write_word_from_u32<F>(cols.unsigned_mem_val, unsigned_mem_val);
+
+// //     uint8_t most_sig_mem_value_byte;
+// //     switch (event.instruction.opcode) {
+// //         case Opcode::LB:
+
+// //             most_sig_mem_value_byte = (uint8_t)unsigned_mem_val;
+// //             break;
+// //         case Opcode::LH:
+// //             most_sig_mem_value_byte = (uint8_t)(unsigned_mem_val >> 8);
+// //             break;
+// //         default:
+// //             // The load instruction is unsigned.
+// //             // Set the `mem_value_is_pos_not_x0` composite flag.
+// //             cols.mem_value_is_pos_not_x0 =
+// //                 F::from_bool(event.instruction.op_a != 0).val;  // 0 = Register::X0
+// //             return;
+// //     }
+// //     // Guard above ensures the load instruction is signed.
+// //     for (intptr_t i = BYTE_SIZE - 1; i >= 0; --i) {
+// //         memory_columns.most_sig_byte_decomp[i] =
+// //             F::from_canonical_u32(most_sig_mem_value_byte >> i & 0x1).val;
+// //     }
+// //     bool mem_value_is_pos_not_x0 = memory_columns.most_sig_byte_decomp[7] == F::zero().val;
+// //     if (!mem_value_is_pos_not_x0) {
+// //         cols.mem_value_is_neg_not_x0 =
+// //             F::from_bool(event.instruction.op_a != 0).val;  // 0 = Register::X0
+// //         cols.unsigned_mem_val_nonce = F::from_canonical_u32(event.memory_sub_nonce).val;
+// //     }
+// //     // Set the `mem_value_is_pos_not_x0` composite flag.
+// //     cols.mem_value_is_pos_not_x0 = F::from_bool(mem_value_is_pos_not_x0).val;
+// // }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ void populate_branch(CpuCols<decltype(F::val)>& cols, const CpuEventFfi& event) {
+// //     // let branch_columns = cols.opcode_specific_columns.branch_mut();
+// //     BranchCols<decltype(F::val)>& branch_columns = cols.opcode_specific_columns.branch;
+
+// //     Opcode opcode = event.instruction.opcode;
+// //     const bool use_signed_comparison = opcode == Opcode::BLT || opcode == Opcode::BGE;
+
+// //     const bool a_eq_b = event.a == event.b;
+// //     const bool a_lt_b =
+// //         use_signed_comparison ? ((int32_t)event.a < (int32_t)event.b) : (event.a < event.b);
+// //     const bool a_gt_b =
+// //         use_signed_comparison ? ((int32_t)event.a > (int32_t)event.b) : (event.a > event.b);
+
+// //     branch_columns.a_lt_b_nonce = F::from_canonical_u32(event.branch_lt_nonce).val;
+// //     branch_columns.a_gt_b_nonce = F::from_canonical_u32(event.branch_gt_nonce).val;
+
+// //     branch_columns.a_eq_b = F::from_bool(a_eq_b).val;
+// //     branch_columns.a_lt_b = F::from_bool(a_lt_b).val;
+// //     branch_columns.a_gt_b = F::from_bool(a_gt_b).val;
+
+// //     bool branching;
+// //     switch (opcode) {
+// //         case Opcode::BEQ:
+// //             branching = a_eq_b;
+// //             break;
+// //         case Opcode::BNE:
+// //             branching = !a_eq_b;
+// //             break;
+// //         case Opcode::BLT:
+// //         case Opcode::BLTU:
+// //             branching = a_lt_b;
+// //             break;
+// //         case Opcode::BGE:
+// //         case Opcode::BGEU:
+// //             branching = a_eq_b || a_gt_b;
+// //             break;
+// //         default:
+// //             // Precondition violated.
+// //             assert(false);
+// //             break;
+// //     }
+
+// //     // Unsigned arithmetic wraps.
+// //     const uint32_t next_pc = event.pc + event.c;
+// //     write_word_from_u32<F>(branch_columns.pc, event.pc);
+// //     write_word_from_u32<F>(branch_columns.next_pc, next_pc);
+// //     babybear_word_populate<F>(branch_columns.pc_range_checker, event.pc);
+// //     babybear_word_populate<F>(branch_columns.next_pc_range_checker, next_pc);
+
+// //     if (branching) {
+// //         cols.branching = F::one().val;
+// //         branch_columns.next_pc_nonce = F::from_canonical_u32(event.branch_add_nonce).val;
+// //     } else {
+// //         cols.not_branching = F::one().val;
+// //     }
+// // }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ void populate_jump(CpuCols<decltype(F::val)>& cols, const CpuEventFfi& event) {
+// //     // let jump_columns = cols.opcode_specific_columns.jump_mut();
+// //     JumpCols<decltype(F::val)>& jump_columns = cols.opcode_specific_columns.jump;
+
+// //     switch (event.instruction.opcode) {
+// //         case Opcode::JAL: {
+// //             // Unsigned arithmetic wraps.
+// //             uint32_t next_pc = event.pc + event.b;
+// //             babybear_word_populate<F>(jump_columns.op_a_range_checker, event.a);
+// //             write_word_from_u32<F>(jump_columns.pc, event.pc);
+// //             babybear_word_populate<F>(jump_columns.pc_range_checker, event.pc);
+// //             write_word_from_u32<F>(jump_columns.next_pc, next_pc);
+// //             babybear_word_populate<F>(jump_columns.next_pc_range_checker, next_pc);
+// //             jump_columns.jal_nonce = F::from_canonical_u32(event.jump_jal_nonce).val;
+// //             break;
+// //         }
+// //         case Opcode::JALR: {
+// //             // Unsigned arithmetic wraps.
+// //             uint32_t next_pc = event.b + event.c;
+// //             babybear_word_populate<F>(jump_columns.op_a_range_checker, event.a);
+// //             write_word_from_u32<F>(jump_columns.next_pc, next_pc);
+// //             babybear_word_populate<F>(jump_columns.next_pc_range_checker, next_pc);
+// //             jump_columns.jalr_nonce = F::from_canonical_u32(event.jump_jalr_nonce).val;
+// //             break;
+// //         }
+// //         default:
+// //             // Precondition violated.
+// //             assert(false);
+// //             break;
+// //     }
+// // }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ void populate_auipc(CpuCols<decltype(F::val)>& cols, const CpuEventFfi& event) {
+// //     AuipcCols<decltype(F::val)>& auipc_columns = cols.opcode_specific_columns.auipc;
+
+// //     write_word_from_u32<F>(auipc_columns.pc, event.pc);
+// //     babybear_word_populate<F>(auipc_columns.pc_range_checker, event.pc);
+// //     auipc_columns.auipc_nonce = F::from_canonical_u32(event.auipc_nonce).val;
+// // }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ void
+// // is_zero_operation_populate_from_field_element(IsZeroOperation<decltype(F::val)>& self, F a) {
+// //     if (a == F::zero()) {
+// //         self.inverse = F::zero().val;
+// //         self.result = F::one().val;
+// //     } else {
+// //         self.inverse = a.reciprocal().val;
+// //         self.result = F::zero().val;
+// //     }
+// //     // F is_zero = F::one() - F(self.inverse) * a;
+// //     // assert(is_zero == F(self.result));
+// //     // let is_zero = one.clone() - cols.inverse * a.clone();
+// //     // builder.when(is_real.clone()).assert_eq(is_zero, cols.result);
+
+// //     // let prod = self.inverse * a;
+// //     // debug_assert!(prod == F::one() || prod == F::zero());
+// //     // (a == F::zero()) as u32
+// // }
+
+// // template<class F>
+// // __SP1_HOSTDEV__ bool populate_ecall(CpuCols<decltype(F::val)>& cols, const CpuEventFfi& event) {
+// //     bool is_halt = false;
+
+// //     // The send_to_table column is the 1st entry of the op_a_access column prev_value field.
+// //     // Look at `ecall_eval` in cpu/air/mod.rs for the corresponding constraint and
+// //     // explanation.
+// //     EcallCols<decltype(F::val)>& ecall_cols = cols.opcode_specific_columns.ecall;
+
+// //     cols.ecall_mul_send_to_table = cols.op_a_access.prev_value._0[1];
+
+// //     F syscall_id = F(cols.op_a_access.prev_value._0[0]);
+
+// //     // In the following statements, truncating to `uint8_t` is the equivalent of the
+// //     // `SyscallCode::get_syscall_id` calls from the Rust code.
+
+// //     // Populate `is_enter_unconstrained`.
+// //     is_zero_operation_populate_from_field_element(
+// //         ecall_cols.is_enter_unconstrained,
+// //         syscall_id - F::from_canonical_u8((uint8_t)SyscallCode::ENTER_UNCONSTRAINED)
+// //     );
+
+// //     // Populate `is_hint_len`.
+// //     is_zero_operation_populate_from_field_element(
+// //         ecall_cols.is_hint_len,
+// //         syscall_id - F::from_canonical_u8((uint8_t)SyscallCode::HINT_LEN)
+// //     );
+
+// //     // Populate `is_halt`.
+// //     is_zero_operation_populate_from_field_element(
+// //         ecall_cols.is_halt,
+// //         syscall_id - F::from_canonical_u8((uint8_t)SyscallCode::HALT)
+// //     );
+
+// //     // Populate `is_commit`.
+// //     is_zero_operation_populate_from_field_element(
+// //         ecall_cols.is_commit,
+// //         syscall_id - F::from_canonical_u8((uint8_t)SyscallCode::COMMIT)
+// //     );
+
+// //     // Populate `is_commit_deferred_proofs`.
+// //     is_zero_operation_populate_from_field_element(
+// //         ecall_cols.is_commit_deferred_proofs,
+// //         syscall_id - F::from_canonical_u8((uint8_t)SyscallCode::COMMIT_DEFERRED_PROOFS)
+// //     );
+
+// //     // If the syscall is `COMMIT` or `COMMIT_DEFERRED_PROOFS`, set the index bitmap and
+// //     // digest word.
+// //     if (syscall_id
+// //             == F::from_canonical_u8((uint8_t)SyscallCode::COMMIT
+// //             )  // Comment to make my editor format nicely...
+// //         || syscall_id == F::from_canonical_u8((uint8_t)SyscallCode::COMMIT_DEFERRED_PROOFS)) {
+// //         uint32_t digest_idx = word_to_u32<F>(cols.op_b_access.access.value);
+// //         ecall_cols.index_bitmap[digest_idx] = F::one().val;
+// //     }
+
+// //     // Write the syscall nonce.
+// //     ecall_cols.syscall_nonce = F::from_canonical_u32(event.syscall_nonce).val;
+
+// //     is_halt = syscall_id == F::from_canonical_u32((uint8_t)SyscallCode::HALT);
+
+// //     // For halt and commit deferred proofs syscalls, we need to baby bear range check one of
+// //     // its operands.
+// //     if (is_halt) {
+// //         write_word_from_u32<F>(ecall_cols.operand_to_check, event.b);
+// //         babybear_word_populate<F>(ecall_cols.operand_range_check_cols, event.b);
+// //         cols.ecall_range_check_operand = F::one().val;
+// //     }
+
+// //     if (syscall_id == F::from_canonical_u32((uint8_t)SyscallCode::COMMIT_DEFERRED_PROOFS)) {
+// //         write_word_from_u32<F>(ecall_cols.operand_to_check, event.c);
+// //         babybear_word_populate<F>(ecall_cols.operand_range_check_cols, event.c);
+// //         cols.ecall_range_check_operand = F::one().val;
+// //     }
+
+// //     return is_halt;
+// // }
+
+// template<class F>
+// __SP1_HOSTDEV__ void event_to_row(const CpuEventFfi& event, CpuCols<decltype(F::val)>& cols) {
+//     // // Populate shard and clk columns.
+//     // populate_shard_clk<F>(event, cols);
+
+//     // // Populate the nonce.
+//     // cols.nonce = F::from_canonical_u32(event.alu_nonce).val;
+
+//     // // Populate basic fields.
+//     // cols.pc = F::from_canonical_u32(event.pc).val;
+//     // cols.next_pc = F::from_canonical_u32(event.next_pc).val;
+//     // instruction_populate<F>(cols.instruction, event.instruction);
+//     // // cols.instruction.populate(event.instruction);
+//     // selectors_populate<F>(cols.selectors, event.instruction);
+//     // // cols.selectors.populate(event.instruction);
+//     // write_word_from_u32<F>(cols.op_a_access.access.value, event.a);
+//     // write_word_from_u32<F>(cols.op_b_access.access.value, event.b);
+//     // write_word_from_u32<F>(cols.op_c_access.access.value, event.c);
+
+//     // // // Populate memory accesses for a, b, and c.
+//     // // The function guards against the record being `None`.
+//     // memory::populate_read_write<F>(cols.op_a_access, event.a_record);
+//     // if (event.b_record.tag == OptionMemoryRecordEnum::Tag::Read) {
+//     //     memory::populate_read<F>(cols.op_b_access, event.b_record.read._0);
+//     // }
+//     // if (event.c_record.tag == OptionMemoryRecordEnum::Tag::Read) {
+//     //     memory::populate_read<F>(cols.op_c_access, event.c_record.read._0);
+//     // }
+
+//     // // // Populate range checks for a.
+//     // // let a_bytes = cols
+//     // //     .op_a_access
+//     // //     .access
+//     // //     .val
+//     // //     .0
+//     // //     .iter()
+//     // //     .map(|x| x.as_canonical_u32())
+//     // //     .collect::<Vec<_>>();
+//     // // blu_events.add_byte_lookup_event(ByteLookupEvent {
+//     // //     shard: event.shard,
+//     // //     opcode: ByteOpcode::U8Range,
+//     // //     a1: 0,
+//     // //     a2: 0,
+//     // //     b: a_bytes[0] as u8,
+//     // //     c: a_bytes[1] as u8,
+//     // // });
+//     // // blu_events.add_byte_lookup_event(ByteLookupEvent {
+//     // //     shard: event.shard,
+//     // //     opcode: ByteOpcode::U8Range,
+//     // //     a1: 0,
+//     // //     a2: 0,
+//     // //     b: a_bytes[2] as u8,
+//     // //     c: a_bytes[3] as u8,
+//     // // });
+
+//     // // Populate memory accesses for reading from memory.
+//     // // `event.memory` appears to be vestigial.
+//     // // assert_eq!(event.memory_record.is_some(), event.memory.is_some());
+//     // // The function guards against the record being `None`.
+//     // memory::populate_read_write<F>(
+//     //     cols.opcode_specific_columns.memory.memory_access,
+//     //     event.memory_record
+//     // );
+
+//     // // Populate memory, branch, jump, and auipc specific fields.
+//     // const bool is_memory = opcode_utils::is_memory(event.instruction.opcode);
+//     // const bool is_branch = opcode_utils::is_branch(event.instruction.opcode);
+//     // const bool is_jump = opcode_utils::is_jump(event.instruction.opcode);
+//     // const bool is_auipc = event.instruction.opcode == Opcode::AUIPC;
+//     // const bool is_ecall = event.instruction.opcode == Opcode::ECALL;
+//     // // Calculated by `populate_ecall`, if called.
+//     // bool is_halt = false;
+//     // // Unlike the Rust code, we guard outside the function bodies so we can reuse the booleans.
+//     // if (is_memory) {
+//     //     populate_memory<F>(cols, event);
+//     // }
+//     // if (is_branch) {
+//     //     populate_branch<F>(cols, event);
+//     // }
+//     // if (is_jump) {
+//     //     populate_jump<F>(cols, event);
+//     // }
+//     // if (is_auipc) {
+//     //     populate_auipc<F>(cols, event);
+//     // }
+//     // if (is_ecall) {
+//     //     is_halt = populate_ecall<F>(cols, event);
+//     // }
+
+//     // cols.is_sequential_instr = F::from_bool(!(is_branch || is_jump || is_halt)).val;
+
+//     // // Assert that the instruction is not a no-op.
+//     // cols.is_real = F::one().val;
+// }
+// }  // namespace sp1_core_machine_sys::cpu
\ No newline at end of file
diff --git a/crates/core/machine/include/lt.hpp b/crates/core/machine/include/lt.hpp
new file mode 100644
index 0000000000..3c83c144f4
--- /dev/null
+++ b/crates/core/machine/include/lt.hpp
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <cstdlib>
+
+#include "prelude.hpp"
+#include "utils.hpp"
+
+namespace sp1_core_machine_sys::lt {
+/// Fills one `LtCols` row from a less-than ALU event (`Opcode::SLT` or `Opcode::SLTU`).
+///
+/// @param event ALU event providing `shard`, `opcode` and the operands `a`, `b`, `c`.
+/// @param cols  Row to fill. It is an `LtCols<F>`, so every column is assigned an `F`
+///              directly (never the raw `.val`). Columns not assigned below keep
+///              whatever the caller left in them (presumably zeros -- TODO confirm).
+/// NOTE(review): `word_from_le_bytes` is declared outside this header -- confirm it
+/// accepts the `Word<F>` operands passed here.
+template<class F>
+__SP1_HOSTDEV__ void event_to_row(const AluEvent& event, LtCols<F>& cols) {
+    array_t<uint8_t, 4> a = u32_to_le_bytes(event.a);
+    array_t<uint8_t, 4> b = u32_to_le_bytes(event.b);
+    array_t<uint8_t, 4> c = u32_to_le_bytes(event.c);
+    cols.shard = F::from_canonical_u32(event.shard);
+    word_from_le_bytes<F>(cols.a, a);
+    word_from_le_bytes<F>(cols.b, b);
+    word_from_le_bytes<F>(cols.c, c);
+
+    // Masked top bytes (sign bit cleared) are always recorded; they enter the comparison only for SLT.
+    uint8_t masked_b = b[3] & 0x7f;
+    uint8_t masked_c = c[3] & 0x7f;
+    cols.b_masked = F::from_canonical_u8(masked_b);
+    cols.c_masked = F::from_canonical_u8(masked_c);
+
+    // Rust reference (not emitted here): send the masked interaction for b and c, i.e.
+    // for (masked, byte) in [(masked_b, b[3]), (masked_c, c[3])]:
+    // blu.add_byte_lookup_event(ByteLookupEvent {
+    //     shard: event.shard,
+    //     channel: event.channel,
+    //     opcode: ByteOpcode::AND,
+    //     a1: masked as u16,
+    //     a2: 0,
+    //     b: byte,
+    //     c: 0x7f,
+    // });
+
+    array_t<uint8_t, 4> b_comp = b;
+    array_t<uint8_t, 4> c_comp = c;
+    if (event.opcode == Opcode::SLT) {
+        b_comp[3] = masked_b;
+        c_comp[3] = masked_c;
+    }
+
+    // Walk from the most significant byte down to the first byte where the operands differ.
+    intptr_t i = 3;
+    while (true) {
+        uint8_t b_byte = b_comp[i];
+        uint8_t c_byte = c_comp[i];
+        if (b_byte != c_byte) {
+            cols.byte_flags[i] = F::one();
+            cols.sltu = F::from_bool(b_byte < c_byte);
+            F b_byte_f = F::from_canonical_u8(b_byte);
+            F c_byte_f = F::from_canonical_u8(c_byte);
+            cols.not_eq_inv = (b_byte_f - c_byte_f).reciprocal();
+            cols.comparison_bytes[0] = b_byte_f;
+            cols.comparison_bytes[1] = c_byte_f;
+            break;
+        }
+        if (i == 0) {
+            // The equality `b_comp == c_comp` holds.
+            cols.is_comp_eq = F::one();
+            break;
+        }
+        --i;
+    }
+
+    cols.msb_b = F::from_bool((b[3] >> 7) & 1);
+    cols.msb_c = F::from_bool((c[3] >> 7) & 1);
+    cols.is_sign_eq = F::from_bool(event.opcode != Opcode::SLT || cols.msb_b == cols.msb_c);
+
+    cols.is_slt = F::from_bool(event.opcode == Opcode::SLT);
+    cols.is_sltu = F::from_bool(event.opcode == Opcode::SLTU);
+
+    cols.bit_b = (F(cols.msb_b) * F(cols.is_slt));
+    cols.bit_c = (F(cols.msb_c) * F(cols.is_slt));
+
+    // Disabled sanity check of the result bit:
+    // if (F(cols.a._0[0]) != F(cols.bit_b) * (F::one() - F(cols.bit_c)) + F(cols.is_sign_eq) * F(cols.sltu)) {
+    //     std::exit(1);
+    // }
+
+    // Rust reference (not emitted here):
+    // blu.add_byte_lookup_event(ByteLookupEvent {
+    //     shard: event.shard, channel: event.channel,
+    //     opcode: ByteOpcode::LTU,
+    //     a1: cols.sltu.as_canonical_u32() as u16,
+    //     a2: 0,
+    //     b: cols.comparison_bytes[0].as_canonical_u32() as u8,
+    //     c: cols.comparison_bytes[1].as_canonical_u32() as u8,
+    // });
+}
+}  // namespace sp1_core_machine_sys::lt
\ No newline at end of file
diff --git a/crates/core/machine/include/memory.hpp b/crates/core/machine/include/memory.hpp
new file mode 100644
index 0000000000..216a3a449d
--- /dev/null
+++ b/crates/core/machine/include/memory.hpp
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <cstdlib>
+
+#include "prelude.hpp"
+#include "utils.hpp"
+
+// namespace sp1_core_machine_sys::memory {
+// __SP1_HOSTDEV__ __SP1_INLINE__ uint32_t unwrap_value(const OptionMemoryRecordEnum& record) {
+//     switch (record.tag) {
+//         case OptionMemoryRecordEnum::Tag::Read:
+//             return record.read._0.value;
+//         case OptionMemoryRecordEnum::Tag::Write:
+//             return record.write._0.value;
+//         default:
+//             // Either the tag is `None` or it is an invalid value.
+//             assert(false);
+//     }
+//     // Unreachable.
+//     return 0;
+// }
+
+// template<class F>
+// __SP1_HOSTDEV__ void populate_access(
+//     MemoryAccessCols<decltype(F::val)>& self,
+//     const MemoryRecord& current_record,
+//     const MemoryRecord& prev_record
+// ) {
+//     write_word_from_u32<F>(self.value, current_record.value);
+
+//     self.prev_shard = F::from_canonical_u32(prev_record.shard).val;
+//     self.prev_clk = F::from_canonical_u32(prev_record.timestamp).val;
+
+//     // Fill columns used for verifying current memory access time value is greater than
+//     // previous's.
+//     const bool use_clk_comparison = prev_record.shard == current_record.shard;
+//     self.compare_clk = F::from_bool(use_clk_comparison).val;
+//     const uint32_t prev_time_value = use_clk_comparison ? prev_record.timestamp : prev_record.shard;
+//     const uint32_t current_time_value =
+//         use_clk_comparison ? current_record.timestamp : current_record.shard;
+
+//     const uint32_t diff_minus_one = current_time_value - prev_time_value - 1;
+//     const uint16_t diff_16bit_limb = (uint16_t)(diff_minus_one);
+//     self.diff_16bit_limb = F::from_canonical_u16(diff_16bit_limb).val;
+//     const uint8_t diff_8bit_limb = (uint8_t)(diff_minus_one >> 16);
+//     self.diff_8bit_limb = F::from_canonical_u8(diff_8bit_limb).val;
+
+//     // let shard = current_record.shard;
+
+//     // // Add a byte table lookup with the 16Range op.
+//     // output.add_u16_range_check(shard, diff_16bit_limb);
+
+//     // // Add a byte table lookup with the U8Range op.
+//     // output.add_u8_range_check(shard, 0, diff_8bit_limb as u8);
+// }
+
+// template<class F>
+// __SP1_HOSTDEV__ void
+// populate_read(MemoryReadCols<F>& self, const MemoryReadRecord& record) {
+//     const MemoryRecord current_record = {
+//         .shard = record.shard,
+//         .timestamp = record.timestamp,
+//         .value = record.value,
+//     };
+//     const MemoryRecord prev_record = {
+//         .shard = record.prev_shard,
+//         .timestamp = record.prev_timestamp,
+//         .value = record.value,
+//     };
+//     populate_access<F>(self.access, current_record, prev_record);
+// }
+
+// template<class F>
+// __SP1_HOSTDEV__ void populate_read_write(
+//     MemoryReadWriteCols<decltype(F::val)>& self,
+//     const OptionMemoryRecordEnum& record
+// ) {
+//     if (record.tag == OptionMemoryRecordEnum::Tag::None) {
+//         return;
+//     }
+//     MemoryRecord current_record;
+//     MemoryRecord prev_record;
+//     switch (record.tag) {
+//         case OptionMemoryRecordEnum::Tag::Read:
+//             current_record = {
+//                 .shard = record.read._0.shard,
+//                 .timestamp = record.read._0.timestamp,
+//                 .value = record.read._0.value,
+//             };
+//             prev_record = {
+//                 .shard = record.read._0.prev_shard,
+//                 .timestamp = record.read._0.prev_timestamp,
+//                 .value = record.read._0.value,
+//             };
+//             break;
+//         case OptionMemoryRecordEnum::Tag::Write:
+//             current_record = {
+//                 .shard = record.write._0.shard,
+//                 .timestamp = record.write._0.timestamp,
+//                 .value = record.write._0.value,
+//             };
+//             prev_record = {
+//                 .shard = record.write._0.prev_shard,
+//                 .timestamp = record.write._0.prev_timestamp,
+//                 .value = record.write._0.prev_value,
+//             };
+//             break;
+//         default:
+//             // Unreachable. `None` case guarded above.
+//             assert(false);
+//             break;
+//     }
+//     write_word_from_u32<F>(self.prev_value, prev_record.value);
+//     populate_access<F>(self.access, current_record, prev_record);
+// }
+// }  // namespace sp1_core_machine_sys::memory
\ No newline at end of file
diff --git a/crates/core/machine/include/memory_global.hpp b/crates/core/machine/include/memory_global.hpp
new file mode 100644
index 0000000000..6859686a56
--- /dev/null
+++ b/crates/core/machine/include/memory_global.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "prelude.hpp"
+#include "utils.hpp"
+#include "bb31_septic_extension_t.hpp"
+#include "memory_local.hpp"
+
+namespace sp1_core_machine_sys::memory_global {
+    /// Fills one `MemoryInitCols` row from a memory initialize/finalize event.
+    /// Covers only the first loop of generate_trace; the second loop is handled in the kernel directly.
+    template<class F, class EF7>
+    __SP1_HOSTDEV__ void event_to_row(const MemoryInitializeFinalizeEvent* event, const bool is_receive, MemoryInitCols<F>* cols) {
+        // A receive keeps the event's shard and timestamp; otherwise both are zeroed.
+        MemoryRecord record;
+        record.shard = is_receive ? event->shard : 0;
+        record.timestamp = is_receive ? event->timestamp : 0;
+        record.value = event->value;
+        sp1_core_machine_sys::memory_local::populate_memory<F, EF7>(&cols->global_interaction_cols, &record, event->addr, is_receive);
+
+        // Scalar columns copied straight from the event.
+        cols->addr = F::from_canonical_u32(event->addr);
+        cols->shard = F::from_canonical_u32(event->shard);
+        cols->timestamp = F::from_canonical_u32(event->timestamp);
+        cols->is_real = F::from_canonical_u32(event->used);
+
+        // Bit `pos` of the address and of the value, least significant bit first.
+        for (uintptr_t pos = 0; pos < 32; ++pos) {
+            cols->addr_bits.bits[pos] = F::from_canonical_u32(((event->addr) >> pos) & 1);
+            cols->value[pos] = F::from_canonical_u32(((event->value) >> pos) & 1);
+        }
+
+        // Chained products of address bits 27, 28, 29 and 30.
+        cols->addr_bits.and_most_sig_byte_decomp_3_to_5 = cols->addr_bits.bits[27] * cols->addr_bits.bits[28];
+        cols->addr_bits.and_most_sig_byte_decomp_3_to_6 = cols->addr_bits.and_most_sig_byte_decomp_3_to_5 * cols->addr_bits.bits[29];
+        cols->addr_bits.and_most_sig_byte_decomp_3_to_7 = cols->addr_bits.and_most_sig_byte_decomp_3_to_6 * cols->addr_bits.bits[30];
+    }
+}  // namespace sp1_core_machine_sys::memory_global
\ No newline at end of file
diff --git a/crates/core/machine/include/memory_local.hpp b/crates/core/machine/include/memory_local.hpp
new file mode 100644
index 0000000000..1d799208b2
--- /dev/null
+++ b/crates/core/machine/include/memory_local.hpp
@@ -0,0 +1,82 @@
+#pragma once
+
+#include "prelude.hpp"
+#include "utils.hpp"
+#include "bb31_septic_extension_t.hpp"
+
+namespace sp1_core_machine_sys::memory_local {
+    /// Tries up to 256 offsets of an x-coordinate built from `record` and `addr`; on the first usable one it
+    /// writes the offset bits, the point (x, y) and the y[6] range-check columns to `cols`, then returns.
+    template<class F, class EF7> __SP1_HOSTDEV__ void populate_memory(GlobalInteractionOperation<F>* cols, const MemoryRecord* record, const uint32_t& addr, bool is_receive) {
+        EF7 x_start;
+        {
+            x_start.value[0] = F::from_canonical_u32(record->shard + (1 << 24));
+            x_start.value[1] = F::from_canonical_u32(record->timestamp);
+            x_start.value[2] = F::from_canonical_u32(addr);
+            x_start.value[3] = F::from_canonical_u32(record->value & 255);  // value, byte 0
+            x_start.value[4] = F::from_canonical_u32((record->value >> 8) & 255);  // value, byte 1
+            x_start.value[5] = F::from_canonical_u32((record->value >> 16) & 255);  // value, byte 2
+            x_start.value[6] = F::from_canonical_u32((record->value >> 24) & 255);  // value, byte 3
+        }
+        #pragma unroll(1)
+        for(uint32_t offset = 0 ; offset < 256 ; offset++) {
+            EF7 x_trial = x_start.universal_hash();
+            EF7 y_sq = x_trial.curve_formula();
+            F y_sq_pow_r = y_sq.pow_r();
+            F is_square = y_sq_pow_r ^ 1006632960;  // presumably `^` is exponentiation on F -- TODO confirm
+            if(is_square == F::one()) {
+                EF7 y = y_sq.sqrt(y_sq_pow_r);
+                if (y.is_exception()) {
+                    x_start += F::from_canonical_u32(1 << 16);  // same step as at the end of the loop body
+                    continue;
+                }
+                if (y.is_receive() != is_receive) {
+                    y = EF7::zero() - y;  // use -y when y's direction flag differs from the requested one
+                }
+                // Record the successful offset (8 bits, least significant first) and the point (x_trial, y).
+                for(uint32_t idx = 0 ; idx < 8 ; idx++ ) {
+                    cols->offset_bits[idx] = F::from_canonical_u32((offset >> idx) & 1);
+                }
+                for(uintptr_t i = 0 ; i < 7 ; i++) {
+                    cols->x_coordinate._0[i] = x_trial.value[i];
+                    cols->y_coordinate._0[i] = y.value[i];
+                }
+                uint32_t range_check_value;  // y[6] shifted by a direction-dependent constant
+                if (is_receive) {
+                    range_check_value = y.value[6].as_canonical_u32() - 1;
+                } else {
+                    range_check_value = y.value[6].as_canonical_u32() - (F::MOD + 1) / 2;
+                }
+                F top_4_bits = F::zero();
+                for(uint32_t idx = 0 ; idx < 30 ; idx++) {
+                    cols->y6_bit_decomp[idx] = F::from_canonical_u32((range_check_value >> idx) & 1);
+                    if (idx >= 26) {  // accumulate bits 26..29
+                        top_4_bits += cols->y6_bit_decomp[idx];
+                    }
+                }
+                top_4_bits -= F::from_canonical_u32(4);
+                cols->range_check_witness = top_4_bits.reciprocal();  // inverse of (sum of bits 26..29) - 4
+                return;
+            }
+            x_start += F::from_canonical_u32(1 << 16);
+        }
+        assert(false);  // none of the 256 offsets produced a usable point
+    }
+
+    /// Fills one `SingleMemoryLocal` entry from a local memory event: both interactions, then the plain columns.
+    template<class F, class EF7>
+    __SP1_HOSTDEV__ void event_to_row(const MemoryLocalEvent* event, SingleMemoryLocal<F>* cols) {
+        populate_memory<F, EF7>(&cols->initial_global_interaction_cols, &event->initial_mem_access, event->addr, true);
+        populate_memory<F, EF7>(&cols->final_global_interaction_cols, &event->final_mem_access, event->addr, false);
+        cols->addr = F::from_canonical_u32(event->addr);
+        // Initial access.
+        cols->initial_shard = F::from_canonical_u32(event->initial_mem_access.shard);
+        cols->initial_clk = F::from_canonical_u32(event->initial_mem_access.timestamp);
+        write_word_from_u32_v2<F>(cols->initial_value, event->initial_mem_access.value);
+        // Final access.
+        cols->final_shard = F::from_canonical_u32(event->final_mem_access.shard);
+        cols->final_clk = F::from_canonical_u32(event->final_mem_access.timestamp);
+        write_word_from_u32_v2<F>(cols->final_value, event->final_mem_access.value);
+        cols->is_real = F::one();
+    }
+}  // namespace sp1_core_machine_sys::memory_local
diff --git a/crates/core/machine/include/mul.hpp b/crates/core/machine/include/mul.hpp
new file mode 100644
index 0000000000..efa564275f
--- /dev/null
+++ b/crates/core/machine/include/mul.hpp
@@ -0,0 +1,111 @@
+#pragma once
+
+#include "prelude.hpp"
+#include "utils.hpp"
+
+namespace sp1_core_machine_sys::mul {
+/// Fills one `MulCols` row from a multiplication ALU event (MUL, MULH, MULHU or MULHSU).
+/// Multiplies `b` and `c` byte by byte into `LONG_WORD_SIZE` limbs (0xFF stands in for the bytes of a
+/// sign-extended operand), then reduces each limb to a base `1 << BYTE_SIZE` digit plus a carry.
+/// NOTE(review): `b_sign_extend` / `c_sign_extend` are only ever set to one here and are read back
+/// below, so this presumably relies on the caller passing a zeroed row -- confirm.
+template<class F>
+__SP1_HOSTDEV__ void event_to_row(const AluEvent& event, MulCols<decltype(F::val)>& cols) {
+    // Rust reference (disabled here): ensure that the opcode is MUL, MULHU, MULH, or MULHSU.
+    // assert!(
+    //     event.opcode == Opcode::MUL || event.opcode == Opcode::MULHU
+    //         || event.opcode == Opcode::MULH || event.opcode == Opcode::MULHSU
+    // );
+
+    const array_t<uint8_t, 4> a = u32_to_le_bytes(event.a);
+    const array_t<uint8_t, 4> b = u32_to_le_bytes(event.b);
+    const array_t<uint8_t, 4> c = u32_to_le_bytes(event.c);
+
+    // Handle b and c's signs.
+    {
+        uint8_t b_msb = get_msb(b);
+        cols.b_msb = F::from_canonical_u8(b_msb).val;
+        uint8_t c_msb = get_msb(c);
+        cols.c_msb = F::from_canonical_u8(c_msb).val;
+
+        // If b is signed and it is negative, sign extend b.
+        if ((event.opcode == Opcode::MULH || event.opcode == Opcode::MULHSU) && b_msb == 1) {
+            cols.b_sign_extend = F::one().val;
+        }
+
+        // If c is signed and it is negative, sign extend c.
+        if (event.opcode == Opcode::MULH && c_msb == 1) {
+            cols.c_sign_extend = F::one().val;
+        }
+
+        // Rust reference (not emitted here): insert the MSB lookup events.
+        // {
+        //     let words = [b_word, c_word];
+        //     let mut blu_events: Vec<ByteLookupEvent> = vec![];
+        //     for word in words.iter() {
+        //         let most_significant_byte = word[WORD_SIZE - 1];
+        //         blu_events.push(ByteLookupEvent {
+        //             shard: event.shard, opcode: ByteOpcode::MSB,
+        //             a1: get_msb(*word) as u16, a2: 0,
+        //             b: most_significant_byte, c: 0,
+        //         });
+        //     }
+        //     record.add_byte_lookup_events(blu_events);
+        // }
+    }
+
+    // Required for the following logic to correctly multiply.
+    static_assert(2 * WORD_SIZE == LONG_WORD_SIZE);
+
+    array_t<uint32_t, LONG_WORD_SIZE> product {};  // zero-initialised limb accumulators
+    for (uintptr_t i = 0; i < WORD_SIZE; ++i) {
+        for (uintptr_t j = 0; j < WORD_SIZE; ++j) {
+            product[i + j] += (uint32_t)b[i] * (uint32_t)c[j];
+        }
+        if (cols.c_sign_extend != F::zero().val) {  // flag read back from the row
+            for (uintptr_t j = WORD_SIZE; j < LONG_WORD_SIZE - i; ++j) {  // keeps i + j < LONG_WORD_SIZE
+                product[i + j] += (uint32_t)b[i] * (uint32_t)0xFF;
+            }
+        }
+    }
+    if (cols.b_sign_extend != F::zero().val) {  // flag read back from the row
+        for (uintptr_t i = WORD_SIZE; i < LONG_WORD_SIZE; ++i) {
+            for (uintptr_t j = 0; j < LONG_WORD_SIZE - i; ++j) {  // keeps i + j < LONG_WORD_SIZE
+                product[i + j] += (uint32_t)0xFF * (uint32_t)c[j];
+            }
+        }
+    }
+
+    // Calculate the correct product using the `product` array. We store the
+    // correct carry value for verification.
+    const uint32_t base = 1 << BYTE_SIZE;
+    array_t<uint32_t, LONG_WORD_SIZE> carry {};
+    for (uintptr_t i = 0; i < LONG_WORD_SIZE; ++i) {
+        carry[i] = product[i] / base;
+        product[i] %= base;
+        if (i + 1 < LONG_WORD_SIZE) {  // the carry out of the last limb is recorded but not propagated
+            product[i + 1] += carry[i];
+        }
+        cols.carry[i] = F::from_canonical_u32(carry[i]).val;
+    }
+
+    for (uintptr_t i = 0; i < LONG_WORD_SIZE; ++i) {
+        cols.product[i] = F::from_canonical_u32(product[i]).val;
+    }
+    word_from_le_bytes<F>(cols.a, a);
+    word_from_le_bytes<F>(cols.b, b);
+    word_from_le_bytes<F>(cols.c, c);
+    cols.is_real = F::one().val;
+    cols.is_mul = F::from_bool(event.opcode == Opcode::MUL).val;
+    cols.is_mulh = F::from_bool(event.opcode == Opcode::MULH).val;
+    cols.is_mulhu = F::from_bool(event.opcode == Opcode::MULHU).val;
+    cols.is_mulhsu = F::from_bool(event.opcode == Opcode::MULHSU).val;
+    cols.shard = F::from_canonical_u32(event.shard).val;
+
+    // Rust reference (not emitted here): range check.
+    // {
+    //     record.add_u16_range_checks(event.shard, &carry.map(|x| x as u16));
+    //     record.add_u8_range_checks(event.shard, &product.map(|x| x as u8));
+    // }
+}
+}  // namespace sp1_core_machine_sys::mul
diff --git a/crates/core/machine/include/prelude.hpp b/crates/core/machine/include/prelude.hpp
new file mode 100644
index 0000000000..1f2b0db1e1
--- /dev/null
+++ b/crates/core/machine/include/prelude.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "sp1-core-machine-sys-cbindgen.hpp"
+
+#ifndef __CUDACC__  // Plain host compiler: no CUDA qualifiers, std::array.
+    #define __SP1_HOSTDEV__
+    #define __SP1_INLINE__ inline
+    #include <array>
+
+namespace sp1_core_machine_sys {
+template<class T, std::size_t N>
+using array_t = std::array<T, N>;
+}  // namespace sp1_core_machine_sys
+#else  // nvcc: functions are callable from host and device, cuda::std::array.
+    #define __SP1_HOSTDEV__ __host__ __device__
+    // `inline` keeps non-template functions defined in headers ODR-safe across translation units.
+    #define __SP1_INLINE__ inline
+    #include <cuda/std/array>
+namespace sp1_core_machine_sys {
+template<class T, std::size_t N>
+using array_t = cuda::std::array<T, N>;
+}  // namespace sp1_core_machine_sys
+#endif
diff --git a/crates/core/machine/include/sll.hpp b/crates/core/machine/include/sll.hpp
new file mode 100644
index 0000000000..5f24a575a0
--- /dev/null
+++ b/crates/core/machine/include/sll.hpp
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <cstdlib>
+
+#include "prelude.hpp"
+#include "utils.hpp"
+
+namespace sp1_core_machine_sys::sll {
+/// Fills one `ShiftLeftCols` row from a shift-left ALU event.
+/// The shift amount is taken from `event.c`: its value modulo `BYTE_SIZE` selects the bit
+/// shift, and its low five bits divided by `BYTE_SIZE` select the byte shift.
+template<class F>
+__SP1_HOSTDEV__ void event_to_row(const AluEvent& event, ShiftLeftCols<decltype(F::val)>& cols) {
+    // Operands as little-endian bytes, plus the row bookkeeping columns.
+    array_t<uint8_t, 4> a_bytes = u32_to_le_bytes(event.a);
+    array_t<uint8_t, 4> b_bytes = u32_to_le_bytes(event.b);
+    array_t<uint8_t, 4> c_bytes = u32_to_le_bytes(event.c);
+    word_from_le_bytes<F>(cols.a, a_bytes);
+    word_from_le_bytes<F>(cols.b, b_bytes);
+    word_from_le_bytes<F>(cols.c, c_bytes);
+    cols.shard = F::from_canonical_u32(event.shard).val;
+    cols.is_real = F::one().val;
+
+    // Bit-shift amount; one pass writes both the low bits of c and the one-hot selector.
+    const uintptr_t bit_shift = event.c % BYTE_SIZE;
+    for (uintptr_t k = 0; k < BYTE_SIZE; ++k) {
+        cols.c_least_sig_byte[k] = F::from_canonical_u32((event.c >> k) & 1).val;
+        cols.shift_by_n_bits[k] = F::from_bool(bit_shift == k).val;
+    }
+
+    const uint32_t multiplier = 1 << bit_shift;
+    cols.bit_shift_multiplier = F::from_canonical_u32(multiplier).val;
+
+    // Multiply b by the power of two limb by limb, carrying the overflow into the next limb.
+    const uint32_t limb_base = 1 << BYTE_SIZE;
+    uint32_t running_carry = 0;
+    for (uintptr_t k = 0; k < WORD_SIZE; ++k) {
+        const uint32_t scaled = b_bytes[k] * multiplier + running_carry;
+        running_carry = scaled / limb_base;
+        const uint8_t low_byte = (uint8_t)(scaled % limb_base);
+        const uint8_t carry_byte = (uint8_t)running_carry;
+        cols.bit_shift_result[k] = F::from_canonical_u8(low_byte).val;
+        cols.bit_shift_result_carry[k] = F::from_canonical_u8(carry_byte).val;
+    }
+
+    // Byte-shift amount (from the low five bits of c) as a one-hot selector.
+    const uintptr_t byte_shift = (uintptr_t)(event.c & 0b11111) / BYTE_SIZE;
+    for (uintptr_t k = 0; k < WORD_SIZE; ++k) {
+        cols.shift_by_n_bytes[k] = F::from_bool(byte_shift == k).val;
+    }
+
+    // Rust reference (not emitted here): range checks.
+    // {
+    //     blu.add_u8_range_checks(event.shard, event.channel, &bit_shift_result);
+    //     blu.add_u8_range_checks(event.shard, event.channel, &bit_shift_result_carry);
+    // }
+
+    // Rust reference (disabled): sanity check.
+    // for i in num_bytes_to_shift..WORD_SIZE {
+    //     debug_assert_eq!(
+    //         cols.bit_shift_result[i - num_bytes_to_shift],
+    //         F::from_canonical_u8(a[i])
+    //     );
+    // }
+}
+}  // namespace sp1_core_machine_sys::sll
\ No newline at end of file
diff --git a/crates/core/machine/include/sr.hpp b/crates/core/machine/include/sr.hpp
new file mode 100644
index 0000000000..a5806fadb1
--- /dev/null
+++ b/crates/core/machine/include/sr.hpp
@@ -0,0 +1,106 @@
+#pragma once
+
+#include <cstdlib>
+
+#include "prelude.hpp"
+#include "utils.hpp"
+
+namespace sp1_core_machine_sys::sr {
+/// Fills one `ShiftRightCols` row from a right-shift ALU event (`Opcode::SRL` or `Opcode::SRA`).
+/// The shift amount `event.c % 32` is applied in two stages over a `LONG_WORD_SIZE`-byte value:
+/// first whole bytes (sign-extending `b` for SRA), then the remaining bits via `shr_carry`.
+/// NOTE(review): `cols.byte_shift_result` entries past the shifted range are not written -- presumably the row starts zeroed; confirm.
+template<class F>
+__SP1_HOSTDEV__ void event_to_row(const AluEvent& event, ShiftRightCols<decltype(F::val)>& cols) {
+    // Initialize cols with basic operands and flags derived from the current event.
+    {
+        cols.shard = F::from_canonical_u32(event.shard).val;
+        write_word_from_u32<F>(cols.a, event.a);
+        write_word_from_u32<F>(cols.b, event.b);
+        write_word_from_u32<F>(cols.c, event.c);
+        cols.b_msb = F::from_canonical_u32((event.b >> 31) & 1).val;
+        cols.is_srl = F::from_bool(event.opcode == Opcode::SRL).val;
+        cols.is_sra = F::from_bool(event.opcode == Opcode::SRA).val;
+        cols.is_real = F::one().val;
+
+        for (uintptr_t i = 0; i < BYTE_SIZE; ++i) {
+            cols.c_least_sig_byte[i] = F::from_canonical_u32((event.c >> i) & 1).val;
+        }
+
+        // Rust reference (not emitted here): insert the MSB lookup event.
+        // let most_significant_byte = event.b.to_le_bytes()[WORD_SIZE - 1];
+        // blu.add_byte_lookup_events(vec![ByteLookupEvent {
+        //     shard: event.shard, opcode: ByteOpcode::MSB,
+        //     a1: ((most_significant_byte >> 7) & 1) as u16,
+        //     a2: 0,
+        //     b: most_significant_byte, c: 0,
+        // }]);
+    }
+
+    // Note that we take the least significant 5 bits per the RISC-V spec.
+    const uintptr_t num_bytes_to_shift = (event.c % 32) / BYTE_SIZE;
+    const uintptr_t num_bits_to_shift = (event.c % 32) % BYTE_SIZE;
+
+    // Byte shifting.
+    // Zero-initialize the array.
+    array_t<uint8_t, LONG_WORD_SIZE> byte_shift_result {};
+    {
+        for (uintptr_t i = 0; i < WORD_SIZE; ++i) {
+            cols.shift_by_n_bytes[i] = F::from_bool(num_bytes_to_shift == i).val;
+        }
+        // Sign extension is necessary only for arithmetic right shift.
+        array_t<uint8_t, 8> sign_extended_b = event.opcode == Opcode::SRA
+            ? u64_to_le_bytes((int64_t)(int32_t)event.b)
+            : u64_to_le_bytes((uint64_t)event.b);
+
+        for (uintptr_t i = 0; i < LONG_WORD_SIZE - num_bytes_to_shift; ++i) {
+            byte_shift_result[i] = sign_extended_b[i + num_bytes_to_shift];
+            cols.byte_shift_result[i] =
+                F::from_canonical_u8(sign_extended_b[i + num_bytes_to_shift]).val;
+        }
+    }
+
+    // Bit shifting.
+    {
+        for (uintptr_t i = 0; i < BYTE_SIZE; ++i) {
+            cols.shift_by_n_bits[i] = F::from_bool(num_bits_to_shift == i).val;
+        }
+        const uint32_t carry_multiplier = 1 << (8 - num_bits_to_shift);
+        uint32_t last_carry = 0;
+        array_t<uint8_t, LONG_WORD_SIZE> bit_shift_result;  // written below; read only by the commented-out range checks
+        array_t<uint8_t, LONG_WORD_SIZE> shr_carry_output_carry;  // likewise
+        array_t<uint8_t, LONG_WORD_SIZE> shr_carry_output_shifted_byte;  // likewise
+        for (intptr_t i = LONG_WORD_SIZE - 1; i >= 0; --i) {  // most significant byte first
+            auto [shift, carry] = shr_carry(byte_shift_result[i], num_bits_to_shift);
+
+            // Rust reference (not emitted here):
+            // let byte_event = ByteLookupEvent {
+            //     shard: event.shard, opcode: ByteOpcode::ShrCarry,
+            //     a1: shift as u16, a2: carry,
+            //     b: byte_shift_result[i], c: num_bits_to_shift as u8,
+            // };
+            // blu.add_byte_lookup_event(byte_event);
+
+            shr_carry_output_carry[i] = carry;
+            cols.shr_carry_output_carry[i] = F::from_canonical_u8(carry).val;
+
+            shr_carry_output_shifted_byte[i] = shift;
+            cols.shr_carry_output_shifted_byte[i] = F::from_canonical_u8(shift).val;
+
+            uint8_t res = (uint8_t)(((uint32_t)shift + last_carry * carry_multiplier) & 0xFF);  // bits carried out of the byte above land in the high bits
+            bit_shift_result[i] = res;
+            cols.bit_shift_result[i] = F::from_canonical_u8(res).val;
+            last_carry = (uint32_t)carry;
+        }
+        // Disabled sanity check:
+        // for (uintptr_t i = 0; i < WORD_SIZE; ++i) {
+        //     assert(cols.a[i] == cols.bit_shift_result[i]);
+        // }
+        // Rust reference (not emitted here): range checks.
+        // blu.add_u8_range_checks(event.shard, &byte_shift_result);
+        // blu.add_u8_range_checks(event.shard, &bit_shift_result);
+        // blu.add_u8_range_checks(event.shard, &shr_carry_output_carry);
+        // blu.add_u8_range_checks(event.shard, &shr_carry_output_shifted_byte);
+    }
+}
+}  // namespace sp1_core_machine_sys::sr
\ No newline at end of file
diff --git a/crates/core/machine/include/sys.hpp b/crates/core/machine/include/sys.hpp
new file mode 100644
index 0000000000..2a6f56166b
--- /dev/null
+++ b/crates/core/machine/include/sys.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "add_sub.hpp"
+#include "bitwise.hpp"
+#include "cpu.hpp"
+#include "lt.hpp"
+#include "memory.hpp"
+#include "mul.hpp"
+#include "sll.hpp"
+#include "sp1-core-machine-sys-cbindgen.hpp"
+#include "sr.hpp"
+#include "memory_local.hpp"
+#include "memory_global.hpp"
+#include "syscall.hpp"
diff --git a/crates/core/machine/include/syscall.hpp b/crates/core/machine/include/syscall.hpp
new file mode 100644
index 0000000000..caa91b32f8
--- /dev/null
+++ b/crates/core/machine/include/syscall.hpp
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "prelude.hpp"
+#include "utils.hpp"
+#include "bb31_septic_extension_t.hpp"
+
+namespace sp1_core_machine_sys::syscall {
+    /// Tries up to 256 offsets of an x-coordinate built from the syscall event; on the first usable one it writes the offset bits, the point (x, y) and the y[6] range-check columns to `cols`, then returns.
+    template<class F, class EF7> __SP1_HOSTDEV__ void populate_syscall(GlobalInteractionOperation<F>* cols, const SyscallEvent* event, bool is_receive) {
+        EF7 x_start;
+        {
+            x_start.value[0] = F::from_canonical_u32(event->shard + 8 * (1 << 24));
+            x_start.value[1] = F::from_canonical_u32(event->clk & ((1 << 16) - 1));  // low 16 bits of clk
+            x_start.value[2] = F::from_canonical_u32((event->clk) >> 16);  // remaining high bits of clk
+            x_start.value[3] = F::from_canonical_u32(event->syscall_id);
+            x_start.value[4] = F::from_canonical_u32(event->arg1);
+            x_start.value[5] = F::from_canonical_u32(event->arg2);
+            x_start.value[6] = F::zero();
+        }
+        #pragma unroll(1)
+        for(uint32_t offset = 0 ; offset < 256 ; offset++) {
+            EF7 x_trial = x_start.universal_hash();
+            EF7 y_sq = x_trial.curve_formula();
+            F y_sq_pow_r = y_sq.pow_r();
+            F is_square = y_sq_pow_r ^ 1006632960;  // presumably `^` is exponentiation on F -- TODO confirm
+            if(is_square == F::one()) {
+                EF7 y = y_sq.sqrt(y_sq_pow_r);
+                if (y.is_exception()) {
+                    x_start += F::from_canonical_u32(1 << 16);  // same step as at the end of the loop body
+                    continue;
+                }
+                if (y.is_receive() != is_receive) {
+                    y = EF7::zero() - y;  // use -y when y's direction flag differs from the requested one
+                }
+                // Record the successful offset (8 bits, least significant first) and the point (x_trial, y).
+                for(uint32_t idx = 0 ; idx < 8 ; idx++ ) {
+                    cols->offset_bits[idx] = F::from_canonical_u32((offset >> idx) & 1);
+                }
+                for(uintptr_t i = 0 ; i < 7 ; i++) {
+                    cols->x_coordinate._0[i] = x_trial.value[i];
+                    cols->y_coordinate._0[i] = y.value[i];
+                }
+                uint32_t range_check_value;  // y[6] shifted by a direction-dependent constant
+                if (is_receive) {
+                    range_check_value = y.value[6].as_canonical_u32() - 1;
+                } else {
+                    range_check_value = y.value[6].as_canonical_u32() - (F::MOD + 1) / 2;
+                }
+                F top_4_bits = F::zero();
+                for(uint32_t idx = 0 ; idx < 30 ; idx++) {
+                    cols->y6_bit_decomp[idx] = F::from_canonical_u32((range_check_value >> idx) & 1);
+                    if (idx >= 26) {  // accumulate bits 26..29
+                        top_4_bits += cols->y6_bit_decomp[idx];
+                    }
+                }
+                top_4_bits -= F::from_canonical_u32(4);
+                cols->range_check_witness = top_4_bits.reciprocal();  // inverse of (sum of bits 26..29) - 4
+                return;
+            }
+            x_start += F::from_canonical_u32(1 << 16);
+        }
+        assert(false);  // none of the 256 offsets produced a usable point
+    }
+
+    /// Fills one `SyscallCols` row from a syscall event: the interaction columns first, then the plain event fields.
+    template<class F, class EF7>
+    __SP1_HOSTDEV__ void event_to_row(const SyscallEvent* event, const bool is_receive, SyscallCols<F>* cols) {
+        populate_syscall<F, EF7>(&cols->global_interaction_cols, event, is_receive);
+        cols->shard = F::from_canonical_u32(event->shard);
+        cols->clk_16 = F::from_canonical_u32(event->clk & ((1 << 16) - 1));  // low 16 bits of clk
+        cols->clk_8 = F::from_canonical_u32((event->clk) >> 16);  // remaining high bits of clk
+        cols->syscall_id = F::from_canonical_u32(event->syscall_id);
+        cols->arg1 = F::from_canonical_u32(event->arg1);
+        cols->arg2 = F::from_canonical_u32(event->arg2);
+        cols->is_real = F::one();
+    }
+}  // namespace sp1_core_machine_sys::syscall
diff --git a/crates/core/machine/include/utils.hpp b/crates/core/machine/include/utils.hpp
new file mode 100644
index 0000000000..7f798ce314
--- /dev/null
+++ b/crates/core/machine/include/utils.hpp
@@ -0,0 +1,134 @@
+#pragma once
+
+#include <cstddef>
+#include <tuple>
+
+#include "prelude.hpp"
+
+namespace sp1_core_machine_sys {
+
+// Little-endian bytes of `n`: element 0 is the least-significant byte.
+__SP1_HOSTDEV__ __SP1_INLINE__ array_t<uint8_t, 4> u32_to_le_bytes(uint32_t n) {
+    return {
+        static_cast<uint8_t>(n),
+        static_cast<uint8_t>(n >> 8),
+        static_cast<uint8_t>(n >> 16),
+        static_cast<uint8_t>(n >> 24),
+    };
+}
+
+__SP1_HOSTDEV__ __SP1_INLINE__ array_t<uint8_t, 8> u64_to_le_bytes(uint64_t n) {
+    return {  // Little-endian: element i holds bits [8i, 8i + 8) of `n`.
+        static_cast<uint8_t>(n),
+        static_cast<uint8_t>(n >> 8),
+        static_cast<uint8_t>(n >> 16),
+        static_cast<uint8_t>(n >> 24),
+        static_cast<uint8_t>(n >> 32),
+        static_cast<uint8_t>(n >> 40),
+        static_cast<uint8_t>(n >> 48),
+        static_cast<uint8_t>(n >> 56),
+    };
+}
+
+/// Shifts `input` right by `rotation % 8` bits; returns {shifted byte, the low bits shifted out}.
+__SP1_HOSTDEV__ __SP1_INLINE__ std::tuple<uint8_t, uint8_t>
+shr_carry(uint8_t input, uint8_t rotation) {
+    const uint8_t shift = rotation & 0x7;
+    if (shift == 0) {
+        return {input, 0};
+    }
+    // Mask selecting the `shift` low bits that fall off the right end.
+    const uint8_t carry_mask = (uint8_t)((1u << shift) - 1);
+    const uint8_t shifted = input >> shift;
+    const uint8_t carried = input & carry_mask;
+    return {shifted, carried};
+}
+
+template<class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void
+write_word_from_u32(Word<decltype(F::val)>& word, const uint32_t value) {
+    // Limb i receives byte i of `value` (least-significant first), stored via `.val`.
+    const array_t<uint8_t, 4> le_bytes = u32_to_le_bytes(value);
+    for (size_t i = 0; i < 4; i++) {
+        word._0[i] = F::from_canonical_u8(le_bytes[i]).val;
+    }
+}
+
+template<class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void
+write_word_from_u32_v2(Word<F>& word, const uint32_t value) {
+    // Limb i receives byte i of `value` (least-significant first) as an `F` element.
+    for (size_t i = 0; i < 4; i++) {
+        word._0[i] = F::from_canonical_u8(static_cast<uint8_t>(value >> (8 * i)));
+    }
+}
+
+template<class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ uint32_t
+word_to_u32(const Word<decltype(F::val)>& word) {  // Limb i -> byte i (LE); widen before shifting.
+    return ((uint32_t)(uint8_t)F(word._0[0]).as_canonical_u32())
+        | ((uint32_t)(uint8_t)F(word._0[1]).as_canonical_u32() << 8)
+        | ((uint32_t)(uint8_t)F(word._0[2]).as_canonical_u32() << 16)
+        | ((uint32_t)(uint8_t)F(word._0[3]).as_canonical_u32() << 24);
+}
+
+template<class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void word_from_le_bytes(
+    Word<decltype(F::val)>& word,
+    const array_t<uint8_t, 4> bytes
+) {
+    // `bytes` already holds `uint8_t` values, so nothing is truncated here;
+    // limb i of `word` is written from `bytes[i]`.
+    for (size_t i = 0; i < 4; i++) {
+        word._0[i] = F::from_canonical_u8(bytes[i]).val;
+    }
+}
+
+__SP1_HOSTDEV__ __SP1_INLINE__ uint8_t
+get_msb(const array_t<uint8_t, WORD_SIZE> a) {
+    return (a[WORD_SIZE - 1] & (1u << (BYTE_SIZE - 1))) != 0 ? 1 : 0;  // Top bit of last byte.
+}
+
+namespace opcode_utils {
+    // Opcode classification predicates.
+    //
+    // Each predicate is a pure membership test against a fixed list of
+    // `Opcode` values: it yields true for a listed opcode and false for
+    // every other one.
+
+    // Memory-access check.
+    // Returns true iff `opcode` is one of:
+    //   LB, LH, LW, LBU, LHU, SB, SH, SW.
+    __SP1_HOSTDEV__ __SP1_INLINE__ bool is_memory(Opcode opcode) {
+        return opcode == Opcode::LB
+            || opcode == Opcode::LH
+            || opcode == Opcode::LW
+            || opcode == Opcode::LBU
+            || opcode == Opcode::LHU
+            || opcode == Opcode::SB
+            || opcode == Opcode::SH
+            || opcode == Opcode::SW;
+    }
+
+    // Branch check.
+    // Returns true iff `opcode` is one of:
+    //   BEQ, BNE, BLT, BGE, BLTU, BGEU.
+    __SP1_HOSTDEV__ __SP1_INLINE__ bool is_branch(Opcode opcode) {
+        return opcode == Opcode::BEQ
+            || opcode == Opcode::BNE
+            || opcode == Opcode::BLT
+            || opcode == Opcode::BGE
+            || opcode == Opcode::BLTU
+            || opcode == Opcode::BGEU;
+    }
+
+    // Jump check.
+    // Returns true iff `opcode` is one of:
+    //   JAL, JALR.
+    __SP1_HOSTDEV__ __SP1_INLINE__ bool is_jump(Opcode opcode) {
+        return opcode == Opcode::JAL
+            || opcode == Opcode::JALR;
+    }
+
+}  // namespace opcode_utils
+}  // namespace sp1_core_machine_sys
diff --git a/crates/core/machine/src/alu/add_sub/mod.rs b/crates/core/machine/src/alu/add_sub/mod.rs
index d276820755..357aca4d6e 100644
--- a/crates/core/machine/src/alu/add_sub/mod.rs
+++ b/crates/core/machine/src/alu/add_sub/mod.rs
@@ -5,8 +5,8 @@ use core::{
 
 use hashbrown::HashMap;
 use itertools::Itertools;
-use p3_air::{Air, AirBuilder, BaseAir};
-use p3_field::{AbstractField, PrimeField};
+use p3_air::{Air, BaseAir};
+use p3_field::PrimeField;
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use p3_maybe_rayon::prelude::{ParallelBridge, ParallelIterator};
 use sp1_core_executor::{
@@ -43,9 +43,6 @@ pub struct AddSubCols<T> {
     /// The shard number, used for byte lookup table.
     pub shard: T,
 
-    /// The nonce of the operation.
-    pub nonce: T,
-
     /// Instance of `AddOperation` to handle addition logic in `AddSubChip`'s ALU operations.
     /// It's result will be `a` for the add operation and `b` for the sub operation.
     pub add_operation: AddOperation<T>,
@@ -98,7 +95,6 @@ impl<F: PrimeField> MachineAir<F> for AddSubChip {
                         let event = &merged_events[idx];
                         self.event_to_row(event, cols, &mut byte_lookup_events);
                     }
-                    cols.nonce = F::from_canonical_usize(idx);
                 });
             },
         );
@@ -137,6 +133,10 @@ impl<F: PrimeField> MachineAir<F> for AddSubChip {
             !shard.add_events.is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl AddSubChip {
@@ -175,12 +175,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &AddSubCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &AddSubCols<AB::Var> = (*next).borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         // Evaluate the addition operation.
         AddOperation::<AB::F>::eval(
@@ -199,7 +193,6 @@ where
             local.operand_1,
             local.operand_2,
             local.shard,
-            local.nonce,
             local.is_add,
         );
 
@@ -210,7 +203,6 @@ where
             local.add_operation.value,
             local.operand_2,
             local.shard,
-            local.nonce,
             local.is_sub,
         );
 
@@ -224,14 +216,45 @@ where
 #[cfg(test)]
 mod tests {
     use p3_baby_bear::BabyBear;
+    use p3_field::AbstractField;
     use p3_matrix::dense::RowMajorMatrix;
+    use p3_maybe_rayon::prelude::ParallelSlice;
     use rand::{thread_rng, Rng};
     use sp1_core_executor::{events::AluEvent, ExecutionRecord, Opcode};
     use sp1_stark::{air::MachineAir, baby_bear_poseidon2::BabyBearPoseidon2, StarkGenericConfig};
+    use std::borrow::BorrowMut;
+    use std::sync::LazyLock;
 
-    use super::AddSubChip;
+    use super::*;
+    use crate::utils::pad_rows_fixed;
     use crate::utils::{uni_stark_prove as prove, uni_stark_verify as verify};
 
+    /// Lazily initialized record for use across multiple tests.
+    /// Holds one fixed `ADD` event (1 + 2); the random `SUB` events are built but never stored.
+    static SHARD: LazyLock<ExecutionRecord> = LazyLock::new(|| {
+        let add_events = (0..1)
+            .flat_map(|i| {
+                [{
+                    let operand_1 = 1u32;
+                    let operand_2 = 2u32;
+                    let result = operand_1.wrapping_add(operand_2);
+                    AluEvent::new(i % 2, 0, Opcode::ADD, result, operand_1, operand_2)
+                }]
+            })
+            .collect::<Vec<_>>();
+        let _sub_events = (0..255) // NOTE(review): unused; uses wrapping_add for SUB — confirm
+            .flat_map(|i| {
+                [{
+                    let operand_1 = thread_rng().gen_range(0..u32::MAX);
+                    let operand_2 = thread_rng().gen_range(0..u32::MAX);
+                    let result = operand_1.wrapping_add(operand_2);
+                    AluEvent::new(i % 2, 0, Opcode::SUB, result, operand_1, operand_2)
+                }]
+            })
+            .collect::<Vec<_>>();
+        ExecutionRecord { add_events, ..Default::default() }
+    });
+
     #[test]
     fn generate_trace() {
         let mut shard = ExecutionRecord::default();
@@ -248,7 +271,7 @@ mod tests {
         let mut challenger = config.challenger();
 
         let mut shard = ExecutionRecord::default();
-        for i in 0..255 {
+        for i in 0..1 {
             let operand_1 = thread_rng().gen_range(0..u32::MAX);
             let operand_2 = thread_rng().gen_range(0..u32::MAX);
             let result = operand_1.wrapping_add(operand_2);
@@ -283,4 +306,54 @@ mod tests {
         let mut challenger = config.challenger();
         verify(&config, &chip, &mut challenger, &proof).unwrap();
     }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        // Both trace generators consume the same lazily built record.
+        let record: &ExecutionRecord = &SHARD;
+        let mut scratch_output = ExecutionRecord::default();
+
+        let expected: RowMajorMatrix<BabyBear> =
+            AddSubChip::default().generate_trace(record, &mut scratch_output);
+        let actual = generate_trace_ffi(record);
+        assert_eq!(actual, expected);
+    }
+
+    #[cfg(feature = "sys")]
+    fn generate_trace_ffi(input: &ExecutionRecord) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        // All ADD events followed by all SUB events, as one list of references.
+        let all_events: Vec<_> = input.add_events.iter().chain(input.sub_events.iter()).collect();
+
+        // Events per parallel chunk: total / CPU count, but never less than one.
+        let per_chunk = (all_events.len() / num_cpus::get()).max(1);
+
+        // Fill one zero-initialized row per event through the FFI entry point.
+        let batches = all_events
+            .par_chunks(per_chunk)
+            .map(|chunk| {
+                chunk
+                    .iter()
+                    .map(|event| {
+                        let mut row = [F::zero(); NUM_ADD_SUB_COLS];
+                        let cols: &mut AddSubCols<F> = row.as_mut_slice().borrow_mut();
+                        // NOTE(review): assumes the FFI only writes inside `cols` — confirm.
+                        unsafe {
+                            crate::sys::add_sub_event_to_row_babybear(event, cols);
+                        }
+                        row
+                    })
+                    .collect::<Vec<_>>()
+            })
+            .collect::<Vec<_>>();
+
+        // Concatenate the per-chunk batches in order, then hand them to `pad_rows_fixed`.
+        let mut rows: Vec<[F; NUM_ADD_SUB_COLS]> = batches.into_iter().flatten().collect();
+        pad_rows_fixed(&mut rows, || [F::zero(); NUM_ADD_SUB_COLS], None);
+
+        // Flatten the rows into a row-major matrix that is NUM_ADD_SUB_COLS wide.
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_ADD_SUB_COLS)
+    }
 }
diff --git a/crates/core/machine/src/alu/bitwise/mod.rs b/crates/core/machine/src/alu/bitwise/mod.rs
index 88156e286c..1cb3605dd9 100644
--- a/crates/core/machine/src/alu/bitwise/mod.rs
+++ b/crates/core/machine/src/alu/bitwise/mod.rs
@@ -5,8 +5,8 @@ use core::{
 
 use hashbrown::HashMap;
 use itertools::Itertools;
-use p3_air::{Air, AirBuilder, BaseAir};
-use p3_field::{AbstractField, PrimeField};
+use p3_air::{Air, BaseAir};
+use p3_field::PrimeField;
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use p3_maybe_rayon::prelude::{IntoParallelRefIterator, ParallelIterator, ParallelSlice};
 use sp1_core_executor::{
@@ -35,9 +35,6 @@ pub struct BitwiseCols<T> {
     /// The shard number, used for byte lookup table.
     pub shard: T,
 
-    /// The nonce of the operation.
-    pub nonce: T,
-
     /// The output operand.
     pub a: Word<T>,
 
@@ -91,16 +88,7 @@ impl<F: PrimeField> MachineAir<F> for BitwiseChip {
         );
 
         // Convert the trace to a row major matrix.
-        let mut trace =
-            RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_BITWISE_COLS);
-
-        for i in 0..trace.height() {
-            let cols: &mut BitwiseCols<F> =
-                trace.values[i * NUM_BITWISE_COLS..(i + 1) * NUM_BITWISE_COLS].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_BITWISE_COLS)
     }
 
     fn generate_dependencies(&self, input: &Self::Record, output: &mut Self::Record) {
@@ -130,6 +118,10 @@ impl<F: PrimeField> MachineAir<F> for BitwiseChip {
             !shard.bitwise_events.is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl BitwiseChip {
@@ -181,12 +173,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &BitwiseCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &BitwiseCols<AB::Var> = (*next).borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         // Get the opcode for the operation.
         let opcode = local.is_xor * ByteOpcode::XOR.as_field::<AB::F>()
@@ -211,7 +197,6 @@ where
             local.b,
             local.c,
             local.shard,
-            local.nonce,
             local.is_xor + local.is_or + local.is_and,
         );
 
diff --git a/crates/core/machine/src/alu/divrem/mod.rs b/crates/core/machine/src/alu/divrem/mod.rs
index 1d4d539fc4..28a7e07798 100644
--- a/crates/core/machine/src/alu/divrem/mod.rs
+++ b/crates/core/machine/src/alu/divrem/mod.rs
@@ -103,9 +103,6 @@ pub struct DivRemCols<T> {
     /// The shard number, used for byte lookup table.
     pub shard: T,
 
-    /// The nonce of the operation.
-    pub nonce: T,
-
     /// The output operand.
     pub a: Word<T>,
 
@@ -185,22 +182,11 @@ pub struct DivRemCols<T> {
     /// Flag to indicate whether `c` is negative.
     pub c_neg: T,
 
-    /// The lower nonce of the operation.
-    pub lower_nonce: T,
-
-    /// The upper nonce of the operation.
-    pub upper_nonce: T,
-
-    /// The absolute nonce of the operation.
-    pub abs_nonce: T,
-
     /// Selector to determine whether an ALU Event is sent for absolute value computation of `c`.
     pub abs_c_alu_event: T,
-    pub abs_c_alu_event_nonce: T,
 
     /// Selector to determine whether an ALU Event is sent for absolute value computation of `rem`.
     pub abs_rem_alu_event: T,
-    pub abs_rem_alu_event_nonce: T,
 
     /// Selector to know whether this row is enabled.
     pub is_real: T,
@@ -278,21 +264,7 @@ impl<F: PrimeField> MachineAir<F> for DivRemChip {
 
                 // Set the `alu_event` flags.
                 cols.abs_c_alu_event = cols.c_neg * cols.is_real;
-                cols.abs_c_alu_event_nonce = F::from_canonical_u32(
-                    input
-                        .nonce_lookup
-                        .get(event.sub_lookups[4].0 as usize)
-                        .copied()
-                        .unwrap_or_default(),
-                );
                 cols.abs_rem_alu_event = cols.rem_neg * cols.is_real;
-                cols.abs_rem_alu_event_nonce = F::from_canonical_u32(
-                    input
-                        .nonce_lookup
-                        .get(event.sub_lookups[5].0 as usize)
-                        .copied()
-                        .unwrap_or_default(),
-                );
 
                 // Insert the MSB lookup events.
                 {
@@ -349,41 +321,6 @@ impl<F: PrimeField> MachineAir<F> for DivRemChip {
                     cols.carry[i] = F::from_canonical_u32(carry[i]);
                 }
 
-                // Insert the necessary multiplication & LT events.
-                {
-                    cols.lower_nonce = F::from_canonical_u32(
-                        input
-                            .nonce_lookup
-                            .get(event.sub_lookups[0].0 as usize)
-                            .copied()
-                            .unwrap_or_default(),
-                    );
-                    cols.upper_nonce = F::from_canonical_u32(
-                        input
-                            .nonce_lookup
-                            .get(event.sub_lookups[1].0 as usize)
-                            .copied()
-                            .unwrap_or_default(),
-                    );
-                    if is_signed_operation(event.opcode) {
-                        cols.abs_nonce = F::from_canonical_u32(
-                            input
-                                .nonce_lookup
-                                .get(event.sub_lookups[2].0 as usize)
-                                .copied()
-                                .unwrap_or_default(),
-                        );
-                    } else {
-                        cols.abs_nonce = F::from_canonical_u32(
-                            input
-                                .nonce_lookup
-                                .get(event.sub_lookups[3].0 as usize)
-                                .copied()
-                                .unwrap_or_default(),
-                        );
-                    };
-                }
-
                 // Range check.
                 {
                     output.add_u8_range_checks(event.shard, &quotient.to_le_bytes());
@@ -426,13 +363,6 @@ impl<F: PrimeField> MachineAir<F> for DivRemChip {
             trace.values[i] = padded_row_template[i % NUM_DIVREM_COLS];
         }
 
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut DivRemCols<F> =
-                trace.values[i * NUM_DIVREM_COLS..(i + 1) * NUM_DIVREM_COLS].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
         trace
     }
 
@@ -443,6 +373,10 @@ impl<F: PrimeField> MachineAir<F> for DivRemChip {
             !shard.divrem_events.is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<F> BaseAir<F> for DivRemChip {
@@ -459,16 +393,10 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &DivRemCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &DivRemCols<AB::Var> = (*next).borrow();
         let base = AB::F::from_canonical_u32(1 << 8);
         let one: AB::Expr = AB::F::one().into();
         let zero: AB::Expr = AB::F::zero().into();
 
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
-
         // Calculate whether b, remainder, and c are negative.
         {
             // Negative if and only if op code is signed & MSB = 1.
@@ -502,7 +430,6 @@ where
                 local.quotient,
                 local.c,
                 local.shard,
-                local.lower_nonce,
                 local.is_real,
             );
 
@@ -527,7 +454,6 @@ where
                 local.quotient,
                 local.c,
                 local.shard,
-                local.upper_nonce,
                 local.is_real,
             );
         }
@@ -685,7 +611,6 @@ where
                 local.c,
                 local.abs_c,
                 local.shard,
-                local.abs_c_alu_event_nonce,
                 local.abs_c_alu_event,
             );
             builder.send_alu(
@@ -694,7 +619,6 @@ where
                 local.remainder,
                 local.abs_remainder,
                 local.shard,
-                local.abs_rem_alu_event_nonce,
                 local.abs_rem_alu_event,
             );
 
@@ -740,7 +664,6 @@ where
                 local.abs_remainder,
                 local.max_abs_c_or_1,
                 local.shard,
-                local.abs_nonce,
                 local.remainder_check_multiplicity,
             );
         }
@@ -816,15 +739,7 @@ where
                     + local.is_rem * rem
             };
 
-            builder.receive_alu(
-                opcode,
-                local.a,
-                local.b,
-                local.c,
-                local.shard,
-                local.nonce,
-                local.is_real,
-            );
+            builder.receive_alu(opcode, local.a, local.b, local.c, local.shard, local.is_real);
         }
     }
 }
diff --git a/crates/core/machine/src/alu/lt/mod.rs b/crates/core/machine/src/alu/lt/mod.rs
index 876fdaaf8f..48280ae993 100644
--- a/crates/core/machine/src/alu/lt/mod.rs
+++ b/crates/core/machine/src/alu/lt/mod.rs
@@ -35,9 +35,6 @@ pub struct LtCols<T> {
     /// The shard number, used for byte lookup table.
     pub shard: T,
 
-    /// The nonce of the operation.
-    pub nonce: T,
-
     /// If the opcode is SLT.
     pub is_slt: T,
 
@@ -124,7 +121,6 @@ impl<F: PrimeField32> MachineAir<F> for LtChip {
                         let event = &input.lt_events[idx];
                         self.event_to_row(event, cols, &mut byte_lookup_events);
                     }
-                    cols.nonce = F::from_canonical_usize(idx);
                 });
             },
         );
@@ -161,6 +157,10 @@ impl<F: PrimeField32> MachineAir<F> for LtChip {
             !shard.lt_events.is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl LtChip {
@@ -269,12 +269,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &LtCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &LtCols<AB::Var> = (*next).borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         let is_real = local.is_slt + local.is_sltu;
 
@@ -448,7 +442,6 @@ where
             local.b,
             local.c,
             local.shard,
-            local.nonce,
             is_real,
         );
     }
diff --git a/crates/core/machine/src/alu/mul/mod.rs b/crates/core/machine/src/alu/mul/mod.rs
index 6a1ce272fe..38cd34adf8 100644
--- a/crates/core/machine/src/alu/mul/mod.rs
+++ b/crates/core/machine/src/alu/mul/mod.rs
@@ -45,7 +45,7 @@ use sp1_core_executor::{
     ByteOpcode, ExecutionRecord, Opcode, Program,
 };
 use sp1_derive::AlignedBorrow;
-use sp1_primitives::consts::WORD_SIZE;
+use sp1_primitives::consts::{BYTE_SIZE, LONG_WORD_SIZE, WORD_SIZE};
 use sp1_stark::{air::MachineAir, Word};
 
 use crate::{
@@ -57,13 +57,6 @@ use crate::{
 /// The number of main trace columns for `MulChip`.
 pub const NUM_MUL_COLS: usize = size_of::<MulCols<u8>>();
 
-/// The number of digits in the product is at most the sum of the number of digits in the
-/// multiplicands.
-const PRODUCT_SIZE: usize = 2 * WORD_SIZE;
-
-/// The number of bits in a byte.
-const BYTE_SIZE: usize = 8;
-
 /// The mask for a byte.
 const BYTE_MASK: u8 = 0xff;
 
@@ -78,9 +71,6 @@ pub struct MulCols<T> {
     /// The shard number, used for byte lookup table.
     pub shard: T,
 
-    /// The nonce of the operation.
-    pub nonce: T,
-
     /// The output operand.
     pub a: Word<T>,
 
@@ -91,10 +81,10 @@ pub struct MulCols<T> {
     pub c: Word<T>,
 
     /// Trace.
-    pub carry: [T; PRODUCT_SIZE],
+    pub carry: [T; LONG_WORD_SIZE],
 
     /// An array storing the product of `b * c` after the carry propagation.
-    pub product: [T; PRODUCT_SIZE],
+    pub product: [T; LONG_WORD_SIZE],
 
     /// The most significant bit of `b`.
     pub b_msb: T,
@@ -156,13 +146,11 @@ impl<F: PrimeField> MachineAir<F> for MulChip {
                         let event = &input.mul_events[idx];
                         self.event_to_row(event, cols, &mut byte_lookup_events);
                     }
-                    cols.nonce = F::from_canonical_usize(idx);
                 });
             },
         );
 
         // Convert the trace to a row major matrix.
-
         RowMajorMatrix::new(values, NUM_MUL_COLS)
     }
 
@@ -193,6 +181,10 @@ impl<F: PrimeField> MachineAir<F> for MulChip {
             !shard.mul_events.is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl MulChip {
@@ -220,13 +212,13 @@ impl MulChip {
             // If b is signed and it is negative, sign extend b.
             if (event.opcode == Opcode::MULH || event.opcode == Opcode::MULHSU) && b_msb == 1 {
                 cols.b_sign_extend = F::one();
-                b.resize(PRODUCT_SIZE, BYTE_MASK);
+                b.resize(LONG_WORD_SIZE, BYTE_MASK);
             }
 
             // If c is signed and it is negative, sign extend c.
             if event.opcode == Opcode::MULH && c_msb == 1 {
                 cols.c_sign_extend = F::one();
-                c.resize(PRODUCT_SIZE, BYTE_MASK);
+                c.resize(LONG_WORD_SIZE, BYTE_MASK);
             }
 
             // Insert the MSB lookup events.
@@ -248,10 +240,10 @@ impl MulChip {
             }
         }
 
-        let mut product = [0u32; PRODUCT_SIZE];
+        let mut product = [0u32; LONG_WORD_SIZE];
         for i in 0..b.len() {
             for j in 0..c.len() {
-                if i + j < PRODUCT_SIZE {
+                if i + j < LONG_WORD_SIZE {
                     product[i + j] += (b[i] as u32) * (c[j] as u32);
                 }
             }
@@ -260,11 +252,11 @@ impl MulChip {
         // Calculate the correct product using the `product` array. We store the
         // correct carry value for verification.
         let base = (1 << BYTE_SIZE) as u32;
-        let mut carry = [0u32; PRODUCT_SIZE];
-        for i in 0..PRODUCT_SIZE {
+        let mut carry = [0u32; LONG_WORD_SIZE];
+        for i in 0..LONG_WORD_SIZE {
             carry[i] = product[i] / base;
             product[i] %= base;
-            if i + 1 < PRODUCT_SIZE {
+            if i + 1 < LONG_WORD_SIZE {
                 product[i + 1] += carry[i];
             }
             cols.carry[i] = F::from_canonical_u32(carry[i]);
@@ -303,18 +295,12 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &MulCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &MulCols<AB::Var> = (*next).borrow();
         let base = AB::F::from_canonical_u32(1 << 8);
 
         let zero: AB::Expr = AB::F::zero().into();
         let one: AB::Expr = AB::F::one().into();
         let byte_mask = AB::F::from_canonical_u8(BYTE_MASK);
 
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
-
         // Calculate the MSBs.
         let (b_msb, c_msb) = {
             let msb_pairs =
@@ -342,9 +328,9 @@ where
 
         // Sign extend local.b and local.c whenever appropriate.
         let (b, c) = {
-            let mut b: Vec<AB::Expr> = vec![AB::F::zero().into(); PRODUCT_SIZE];
-            let mut c: Vec<AB::Expr> = vec![AB::F::zero().into(); PRODUCT_SIZE];
-            for i in 0..PRODUCT_SIZE {
+            let mut b: Vec<AB::Expr> = vec![AB::F::zero().into(); LONG_WORD_SIZE];
+            let mut c: Vec<AB::Expr> = vec![AB::F::zero().into(); LONG_WORD_SIZE];
+            for i in 0..LONG_WORD_SIZE {
                 if i < WORD_SIZE {
                     b[i] = local.b[i].into();
                     c[i] = local.c[i].into();
@@ -357,10 +343,10 @@ where
         };
 
         // Compute the uncarried product b(x) * c(x) = m(x).
-        let mut m: Vec<AB::Expr> = vec![AB::F::zero().into(); PRODUCT_SIZE];
-        for i in 0..PRODUCT_SIZE {
-            for j in 0..PRODUCT_SIZE {
-                if i + j < PRODUCT_SIZE {
+        let mut m: Vec<AB::Expr> = vec![AB::F::zero().into(); LONG_WORD_SIZE];
+        for i in 0..LONG_WORD_SIZE {
+            for j in 0..LONG_WORD_SIZE {
+                if i + j < LONG_WORD_SIZE {
                     m[i + j] = m[i + j].clone() + b[i].clone() * c[j].clone();
                 }
             }
@@ -368,7 +354,7 @@ where
 
         // Propagate carry.
         let product = {
-            for i in 0..PRODUCT_SIZE {
+            for i in 0..LONG_WORD_SIZE {
                 if i == 0 {
                     builder.assert_eq(local.product[i], m[i].clone() - local.carry[i] * base);
                 } else {
@@ -441,15 +427,7 @@ where
         }
 
         // Receive the arguments.
-        builder.receive_alu(
-            opcode,
-            local.a,
-            local.b,
-            local.c,
-            local.shard,
-            local.nonce,
-            local.is_real,
-        );
+        builder.receive_alu(opcode, local.a, local.b, local.c, local.shard, local.is_real);
     }
 }
 
diff --git a/crates/core/machine/src/alu/sll/mod.rs b/crates/core/machine/src/alu/sll/mod.rs
index af00dd47a5..fcd84a98c2 100644
--- a/crates/core/machine/src/alu/sll/mod.rs
+++ b/crates/core/machine/src/alu/sll/mod.rs
@@ -68,9 +68,6 @@ pub struct ShiftLeftCols<T> {
     /// The shard number, used for byte lookup table.
     pub shard: T,
 
-    /// The nonce of the operation.
-    pub nonce: T,
-
     /// The output operand.
     pub a: Word<T>,
 
@@ -154,12 +151,6 @@ impl<F: PrimeField> MachineAir<F> for ShiftLeft {
             trace.values[i] = padded_row_template[i % NUM_SHIFT_LEFT_COLS];
         }
 
-        for i in 0..trace.height() {
-            let cols: &mut ShiftLeftCols<F> =
-                trace.values[i * NUM_SHIFT_LEFT_COLS..(i + 1) * NUM_SHIFT_LEFT_COLS].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
         trace
     }
 
@@ -190,6 +181,10 @@ impl<F: PrimeField> MachineAir<F> for ShiftLeft {
             !shard.shift_left_events.is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl ShiftLeft {
@@ -270,17 +265,11 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &ShiftLeftCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &ShiftLeftCols<AB::Var> = (*next).borrow();
 
         let zero: AB::Expr = AB::F::zero().into();
         let one: AB::Expr = AB::F::one().into();
         let base: AB::Expr = AB::F::from_canonical_u32(1 << BYTE_SIZE).into();
 
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
-
         // We first "bit shift" and next we "byte shift". Then we compare the results with a.
         // Finally, we perform some misc checks.
 
@@ -392,7 +381,6 @@ where
             local.b,
             local.c,
             local.shard,
-            local.nonce,
             local.is_real,
         );
     }
diff --git a/crates/core/machine/src/alu/sr/mod.rs b/crates/core/machine/src/alu/sr/mod.rs
index b26c949945..916755a310 100644
--- a/crates/core/machine/src/alu/sr/mod.rs
+++ b/crates/core/machine/src/alu/sr/mod.rs
@@ -88,9 +88,6 @@ pub struct ShiftRightCols<T> {
     /// The shard number, used for byte lookup table.
     pub shard: T,
 
-    /// The nonce of the operation.
-    pub nonce: T,
-
     /// The output operand.
     pub a: Word<T>,
 
@@ -169,7 +166,6 @@ impl<F: PrimeField> MachineAir<F> for ShiftRightChip {
                         cols.shift_by_n_bits[0] = F::one();
                         cols.shift_by_n_bytes[0] = F::one();
                     }
-                    cols.nonce = F::from_canonical_usize(idx);
                 });
             },
         );
@@ -205,6 +201,10 @@ impl<F: PrimeField> MachineAir<F> for ShiftRightChip {
             !shard.shift_right_events.is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl ShiftRightChip {
@@ -329,15 +329,9 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &ShiftRightCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &ShiftRightCols<AB::Var> = (*next).borrow();
         let zero: AB::Expr = AB::F::zero().into();
         let one: AB::Expr = AB::F::one().into();
 
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
-
         // Check that the MSB of most_significant_byte matches local.b_msb using lookup.
         {
             let byte = local.b[WORD_SIZE - 1];
@@ -513,7 +507,6 @@ where
             local.b,
             local.c,
             local.shard,
-            local.nonce,
             local.is_real,
         );
     }
diff --git a/crates/core/machine/src/bytes/trace.rs b/crates/core/machine/src/bytes/trace.rs
index 6cfb81bbfc..d62374ceb5 100644
--- a/crates/core/machine/src/bytes/trace.rs
+++ b/crates/core/machine/src/bytes/trace.rs
@@ -1,6 +1,6 @@
 use std::borrow::BorrowMut;
 
-use p3_field::Field;
+use p3_field::PrimeField;
 use p3_matrix::dense::RowMajorMatrix;
 use sp1_core_executor::{ByteOpcode, ExecutionRecord, Program};
 use sp1_stark::air::MachineAir;
@@ -14,7 +14,7 @@ use super::{
 
 pub const NUM_ROWS: usize = 1 << 16;
 
-impl<F: Field> MachineAir<F> for ByteChip<F> {
+impl<F: PrimeField> MachineAir<F> for ByteChip<F> {
     type Record = ExecutionRecord;
 
     type Program = Program;
diff --git a/crates/core/machine/src/cpu/air/branch.rs b/crates/core/machine/src/cpu/air/branch.rs
index d8c615682f..176f316da2 100644
--- a/crates/core/machine/src/cpu/air/branch.rs
+++ b/crates/core/machine/src/cpu/air/branch.rs
@@ -88,7 +88,6 @@ impl CpuChip {
                 branch_cols.pc,
                 local.op_c_val(),
                 local.shard,
-                branch_cols.next_pc_nonce,
                 local.branching,
             );
 
@@ -185,7 +184,6 @@ impl CpuChip {
             local.op_a_val(),
             local.op_b_val(),
             local.shard,
-            branch_cols.a_lt_b_nonce,
             is_branch_instruction.clone(),
         );
 
@@ -197,7 +195,6 @@ impl CpuChip {
             local.op_b_val(),
             local.op_a_val(),
             local.shard,
-            branch_cols.a_gt_b_nonce,
             is_branch_instruction.clone(),
         );
     }
diff --git a/crates/core/machine/src/cpu/air/ecall.rs b/crates/core/machine/src/cpu/air/ecall.rs
index 59785123fb..3d8b70b7c1 100644
--- a/crates/core/machine/src/cpu/air/ecall.rs
+++ b/crates/core/machine/src/cpu/air/ecall.rs
@@ -54,7 +54,6 @@ impl CpuChip {
         builder.send_syscall(
             local.shard,
             local.clk,
-            ecall_cols.syscall_nonce,
             syscall_id,
             local.op_b_val().reduce::<AB>(),
             local.op_c_val().reduce::<AB>(),
diff --git a/crates/core/machine/src/cpu/air/memory.rs b/crates/core/machine/src/cpu/air/memory.rs
index 79054d1abf..70eede3b80 100644
--- a/crates/core/machine/src/cpu/air/memory.rs
+++ b/crates/core/machine/src/cpu/air/memory.rs
@@ -72,7 +72,6 @@ impl CpuChip {
             local.op_b_val(),
             local.op_c_val(),
             local.shard,
-            memory_columns.addr_word_nonce,
             is_memory_instruction.clone(),
         );
 
@@ -174,7 +173,6 @@ impl CpuChip {
             local.unsigned_mem_val,
             signed_value,
             local.shard,
-            local.unsigned_mem_val_nonce,
             local.mem_value_is_neg_not_x0,
         );
 
diff --git a/crates/core/machine/src/cpu/air/mod.rs b/crates/core/machine/src/cpu/air/mod.rs
index 35b6dc18af..38d40e8437 100644
--- a/crates/core/machine/src/cpu/air/mod.rs
+++ b/crates/core/machine/src/cpu/air/mod.rs
@@ -66,7 +66,6 @@ where
             local.op_b_val(),
             local.op_c_val(),
             local.shard,
-            local.nonce,
             is_alu_instruction,
         );
 
@@ -196,7 +195,6 @@ impl CpuChip {
             jump_columns.pc,
             local.op_b_val(),
             local.shard,
-            jump_columns.jal_nonce,
             local.selectors.is_jal,
         );
 
@@ -207,7 +205,6 @@ impl CpuChip {
             local.op_b_val(),
             local.op_c_val(),
             local.shard,
-            jump_columns.jalr_nonce,
             local.selectors.is_jalr,
         );
     }
@@ -235,7 +232,6 @@ impl CpuChip {
             auipc_columns.pc,
             local.op_b_val(),
             local.shard,
-            auipc_columns.auipc_nonce,
             local.selectors.is_auipc,
         );
     }
diff --git a/crates/core/machine/src/cpu/columns/auipc.rs b/crates/core/machine/src/cpu/columns/auipc.rs
index 5505f213b3..0e71ff59d4 100644
--- a/crates/core/machine/src/cpu/columns/auipc.rs
+++ b/crates/core/machine/src/cpu/columns/auipc.rs
@@ -12,5 +12,4 @@ pub struct AuipcCols<T> {
     /// The current program counter.
     pub pc: Word<T>,
     pub pc_range_checker: BabyBearWordRangeChecker<T>,
-    pub auipc_nonce: T,
 }
diff --git a/crates/core/machine/src/cpu/columns/branch.rs b/crates/core/machine/src/cpu/columns/branch.rs
index 6f12f5a675..7b67394894 100644
--- a/crates/core/machine/src/cpu/columns/branch.rs
+++ b/crates/core/machine/src/cpu/columns/branch.rs
@@ -26,13 +26,4 @@ pub struct BranchCols<T> {
 
     /// Whether a is less than b.
     pub a_lt_b: T,
-
-    /// The nonce of the operation to compute `a_lt_b`.
-    pub a_lt_b_nonce: T,
-
-    /// The nonce of the operation to compute `a_gt_b`.
-    pub a_gt_b_nonce: T,
-
-    /// The nonce of the operation to compute `next_pc`.
-    pub next_pc_nonce: T,
 }
diff --git a/crates/core/machine/src/cpu/columns/ecall.rs b/crates/core/machine/src/cpu/columns/ecall.rs
index ea737c169e..0158940e24 100644
--- a/crates/core/machine/src/cpu/columns/ecall.rs
+++ b/crates/core/machine/src/cpu/columns/ecall.rs
@@ -28,9 +28,6 @@ pub struct EcallCols<T> {
     /// should be set to 1 and everything else set to 0.
     pub index_bitmap: [T; PV_DIGEST_NUM_WORDS],
 
-    /// The nonce of the syscall operation.
-    pub syscall_nonce: T,
-
     /// Columns to babybear range check the halt/commit_deferred_proofs operand.
     pub operand_range_check_cols: BabyBearWordRangeChecker<T>,
 
diff --git a/crates/core/machine/src/cpu/columns/jump.rs b/crates/core/machine/src/cpu/columns/jump.rs
index 579f2b5160..f5c2b51917 100644
--- a/crates/core/machine/src/cpu/columns/jump.rs
+++ b/crates/core/machine/src/cpu/columns/jump.rs
@@ -19,7 +19,4 @@ pub struct JumpCols<T> {
 
     // A range checker for `op_a` which may contain `pc + 4`.
     pub op_a_range_checker: BabyBearWordRangeChecker<T>,
-
-    pub jal_nonce: T,
-    pub jalr_nonce: T,
 }
diff --git a/crates/core/machine/src/cpu/columns/memory.rs b/crates/core/machine/src/cpu/columns/memory.rs
index 3eb52337ab..b4123714f8 100644
--- a/crates/core/machine/src/cpu/columns/memory.rs
+++ b/crates/core/machine/src/cpu/columns/memory.rs
@@ -33,7 +33,4 @@ pub struct MemoryColumns<T> {
     // LE bit decomposition for the most significant byte of memory value.  This is used to
     // determine the sign for that value (used for LB and LH).
     pub most_sig_byte_decomp: [T; 8],
-
-    pub addr_word_nonce: T,
-    pub unsigned_mem_val_nonce: T,
 }
diff --git a/crates/core/machine/src/cpu/columns/mod.rs b/crates/core/machine/src/cpu/columns/mod.rs
index 7a32b03db5..be820ab67d 100644
--- a/crates/core/machine/src/cpu/columns/mod.rs
+++ b/crates/core/machine/src/cpu/columns/mod.rs
@@ -34,8 +34,6 @@ pub struct CpuCols<T: Copy> {
     /// The current shard.
     pub shard: T,
 
-    pub nonce: T,
-
     /// The clock cycle value.  This should be within 24 bits.
     pub clk: T,
     /// The least significant 16 bit limb of clk.
@@ -101,8 +99,6 @@ pub struct CpuCols<T: Copy> {
     /// memory opcodes (i.e. LB, LH, LW, LBU, and LHU).
     pub unsigned_mem_val: Word<T>,
 
-    pub unsigned_mem_val_nonce: T,
-
     /// The result of selectors.is_ecall * the send_to_table column for the ECALL opcode.
     pub ecall_mul_send_to_table: T,
 
diff --git a/crates/core/machine/src/cpu/trace.rs b/crates/core/machine/src/cpu/trace.rs
index 5a43202608..7b246c0307 100644
--- a/crates/core/machine/src/cpu/trace.rs
+++ b/crates/core/machine/src/cpu/trace.rs
@@ -58,14 +58,7 @@ impl<F: PrimeField32> MachineAir<F> for CpuChip {
                         let mut byte_lookup_events = Vec::new();
                         let event = &input.cpu_events[idx];
                         let instruction = &input.program.fetch(event.pc);
-                        self.event_to_row(
-                            event,
-                            &input.nonce_lookup,
-                            cols,
-                            &mut byte_lookup_events,
-                            shard,
-                            instruction,
-                        );
+                        self.event_to_row(event, cols, &mut byte_lookup_events, shard, instruction);
                     }
                 });
             },
@@ -91,14 +84,7 @@ impl<F: PrimeField32> MachineAir<F> for CpuChip {
                     let mut row = [F::zero(); NUM_CPU_COLS];
                     let cols: &mut CpuCols<F> = row.as_mut_slice().borrow_mut();
                     let instruction = &input.program.fetch(op.pc);
-                    self.event_to_row::<F>(
-                        op,
-                        &input.nonce_lookup,
-                        cols,
-                        &mut blu,
-                        shard,
-                        instruction,
-                    );
+                    self.event_to_row::<F>(op, cols, &mut blu, shard, instruction);
                 });
                 blu
             })
@@ -121,7 +107,6 @@ impl CpuChip {
     fn event_to_row<F: PrimeField32>(
         &self,
         event: &CpuEvent,
-        nonce_lookup: &[u32],
         cols: &mut CpuCols<F>,
         blu_events: &mut impl ByteRecord,
         shard: u32,
@@ -130,11 +115,6 @@ impl CpuChip {
         // Populate shard and clk columns.
         self.populate_shard_clk(cols, event, blu_events, shard);
 
-        // Populate the nonce.
-        cols.nonce = F::from_canonical_u32(
-            nonce_lookup.get(event.alu_lookup_id.0 as usize).copied().unwrap_or_default(),
-        );
-
         // Populate basic fields.
         cols.pc = F::from_canonical_u32(event.pc);
         cols.next_pc = F::from_canonical_u32(event.next_pc);
@@ -188,11 +168,11 @@ impl CpuChip {
         }
 
         // Populate memory, branch, jump, and auipc specific fields.
-        self.populate_memory(cols, event, blu_events, nonce_lookup, shard, instruction);
-        self.populate_branch(cols, event, nonce_lookup, instruction);
-        self.populate_jump(cols, event, nonce_lookup, instruction);
-        self.populate_auipc(cols, event, nonce_lookup, instruction);
-        let is_halt = self.populate_ecall(cols, event, nonce_lookup);
+        self.populate_memory(cols, event, blu_events, shard, instruction);
+        self.populate_branch(cols, event, instruction);
+        self.populate_jump(cols, event, instruction);
+        self.populate_auipc(cols, event, instruction);
+        let is_halt = self.populate_ecall(cols, event);
 
         cols.is_sequential_instr = F::from_bool(
             !instruction.is_branch_instruction() && !instruction.is_jump_instruction() && !is_halt,
@@ -250,7 +230,6 @@ impl CpuChip {
         cols: &mut CpuCols<F>,
         event: &CpuEvent,
         blu_events: &mut impl ByteRecord,
-        nonce_lookup: &[u32],
         shard: u32,
         instruction: &Instruction,
     ) {
@@ -281,9 +260,6 @@ impl CpuChip {
         let aligned_addr_ls_byte = (aligned_addr & 0x000000FF) as u8;
         let bits: [bool; 8] = array::from_fn(|i| aligned_addr_ls_byte & (1 << i) != 0);
         memory_columns.aa_least_sig_byte_decomp = array::from_fn(|i| F::from_bool(bits[i + 2]));
-        memory_columns.addr_word_nonce = F::from_canonical_u32(
-            nonce_lookup.get(event.memory_add_lookup_id.0 as usize).copied().unwrap_or_default(),
-        );
 
         // Populate memory offsets.
         let addr_offset = (memory_addr % WORD_SIZE as u32) as u8;
@@ -331,12 +307,6 @@ impl CpuChip {
                 }
                 if memory_columns.most_sig_byte_decomp[7] == F::one() {
                     cols.mem_value_is_neg_not_x0 = F::from_bool(instruction.op_a != (X0 as u8));
-                    cols.unsigned_mem_val_nonce = F::from_canonical_u32(
-                        nonce_lookup
-                            .get(event.memory_sub_lookup_id.0 as usize)
-                            .copied()
-                            .unwrap_or_default(),
-                    );
                 }
             }
 
@@ -368,7 +338,6 @@ impl CpuChip {
         &self,
         cols: &mut CpuCols<F>,
         event: &CpuEvent,
-        nonce_lookup: &[u32],
         instruction: &Instruction,
     ) {
         if instruction.is_branch_instruction() {
@@ -389,14 +358,6 @@ impl CpuChip {
                 event.a > event.b
             };
 
-            branch_columns.a_lt_b_nonce = F::from_canonical_u32(
-                nonce_lookup.get(event.branch_lt_lookup_id.0 as usize).copied().unwrap_or_default(),
-            );
-
-            branch_columns.a_gt_b_nonce = F::from_canonical_u32(
-                nonce_lookup.get(event.branch_gt_lookup_id.0 as usize).copied().unwrap_or_default(),
-            );
-
             branch_columns.a_eq_b = F::from_bool(a_eq_b);
             branch_columns.a_lt_b = F::from_bool(a_lt_b);
             branch_columns.a_gt_b = F::from_bool(a_gt_b);
@@ -417,12 +378,6 @@ impl CpuChip {
 
             if branching {
                 cols.branching = F::one();
-                branch_columns.next_pc_nonce = F::from_canonical_u32(
-                    nonce_lookup
-                        .get(event.branch_add_lookup_id.0 as usize)
-                        .copied()
-                        .unwrap_or_default(),
-                );
             } else {
                 cols.not_branching = F::one();
             }
@@ -434,7 +389,6 @@ impl CpuChip {
         &self,
         cols: &mut CpuCols<F>,
         event: &CpuEvent,
-        nonce_lookup: &[u32],
         instruction: &Instruction,
     ) {
         if instruction.is_jump_instruction() {
@@ -448,24 +402,12 @@ impl CpuChip {
                     jump_columns.pc_range_checker.populate(event.pc);
                     jump_columns.next_pc = Word::from(next_pc);
                     jump_columns.next_pc_range_checker.populate(next_pc);
-                    jump_columns.jal_nonce = F::from_canonical_u32(
-                        nonce_lookup
-                            .get(event.jump_jal_lookup_id.0 as usize)
-                            .copied()
-                            .unwrap_or_default(),
-                    );
                 }
                 Opcode::JALR => {
                     let next_pc = event.b.wrapping_add(event.c);
                     jump_columns.op_a_range_checker.populate(event.a);
                     jump_columns.next_pc = Word::from(next_pc);
                     jump_columns.next_pc_range_checker.populate(next_pc);
-                    jump_columns.jalr_nonce = F::from_canonical_u32(
-                        nonce_lookup
-                            .get(event.jump_jalr_lookup_id.0 as usize)
-                            .copied()
-                            .unwrap_or_default(),
-                    );
                 }
                 _ => unreachable!(),
             }
@@ -477,7 +419,6 @@ impl CpuChip {
         &self,
         cols: &mut CpuCols<F>,
         event: &CpuEvent,
-        nonce_lookup: &[u32],
         instruction: &Instruction,
     ) {
         if matches!(instruction.opcode, Opcode::AUIPC) {
@@ -485,19 +426,11 @@ impl CpuChip {
 
             auipc_columns.pc = Word::from(event.pc);
             auipc_columns.pc_range_checker.populate(event.pc);
-            auipc_columns.auipc_nonce = F::from_canonical_u32(
-                nonce_lookup.get(event.auipc_lookup_id.0 as usize).copied().unwrap_or_default(),
-            );
         }
     }
 
     /// Populate columns related to ECALL.
-    fn populate_ecall<F: PrimeField>(
-        &self,
-        cols: &mut CpuCols<F>,
-        event: &CpuEvent,
-        nonce_lookup: &[u32],
-    ) -> bool {
+    fn populate_ecall<F: PrimeField>(&self, cols: &mut CpuCols<F>, event: &CpuEvent) -> bool {
         let mut is_halt = false;
 
         if cols.selectors.is_ecall == F::one() {
@@ -548,10 +481,6 @@ impl CpuChip {
                 ecall_cols.index_bitmap[digest_idx] = F::one();
             }
 
-            // Write the syscall nonce.
-            ecall_cols.syscall_nonce =
-                F::from_canonical_u32(nonce_lookup[event.syscall_lookup_id.0 as usize]);
-
             is_halt = syscall_id == F::from_canonical_u32(SyscallCode::HALT.syscall_id());
 
             // For halt and commit deferred proofs syscalls, we need to baby bear range check one of
diff --git a/crates/core/machine/src/lib.rs b/crates/core/machine/src/lib.rs
index 168be94eac..8e2b661cd5 100644
--- a/crates/core/machine/src/lib.rs
+++ b/crates/core/machine/src/lib.rs
@@ -23,6 +23,8 @@ pub mod memory;
 pub mod operations;
 pub mod program;
 pub mod riscv;
+#[cfg(feature = "sys")]
+pub mod sys;
 pub mod syscall;
 pub mod utils;
 
@@ -31,7 +33,7 @@ pub mod utils;
 /// This string should be updated whenever any step in verifying an SP1 proof changes, including
 /// core, recursion, and plonk-bn254. This string is used to download SP1 artifacts and the gnark
 /// docker image.
-pub const SP1_CIRCUIT_VERSION: &str = "v3.0.0";
+pub const SP1_CIRCUIT_VERSION: &str = "v4.0.0-rc.1";
 
 // Re-export the `SP1ReduceProof` struct from sp1_core_machine.
 //
diff --git a/crates/core/machine/src/memory/global.rs b/crates/core/machine/src/memory/global.rs
index db58e9c351..615b769bfb 100644
--- a/crates/core/machine/src/memory/global.rs
+++ b/crates/core/machine/src/memory/global.rs
@@ -1,28 +1,35 @@
+use super::MemoryChipType;
+use crate::{
+    operations::GlobalAccumulationOperation,
+    operations::GlobalInteractionOperation,
+    operations::{AssertLtColsBits, BabyBearBitDecomposition, IsZeroOperation},
+    utils::pad_rows_fixed,
+};
 use core::{
     borrow::{Borrow, BorrowMut},
     mem::size_of,
 };
-use std::array;
-
+use hashbrown::HashMap;
+use itertools::Itertools;
 use p3_air::{Air, AirBuilder, BaseAir};
 use p3_field::{AbstractField, PrimeField32};
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
-use sp1_core_executor::{events::MemoryInitializeFinalizeEvent, ExecutionRecord, Program};
+use p3_maybe_rayon::prelude::{IntoParallelRefIterator, ParallelIterator, ParallelSlice};
+use sp1_core_executor::events::ByteLookupEvent;
+use sp1_core_executor::{
+    events::{ByteRecord, MemoryInitializeFinalizeEvent},
+    ExecutionRecord, Program,
+};
 use sp1_derive::AlignedBorrow;
 use sp1_stark::{
     air::{
-        AirInteraction, BaseAirBuilder, InteractionScope, MachineAir, PublicValues, SP1AirBuilder,
+        BaseAirBuilder, InteractionScope, MachineAir, PublicValues, SP1AirBuilder,
         SP1_PROOF_NUM_PV_ELTS,
     },
-    InteractionKind, Word,
+    septic_digest::SepticDigest,
+    Word,
 };
-
-use crate::{
-    operations::{AssertLtColsBits, BabyBearBitDecomposition, IsZeroOperation},
-    utils::pad_rows_fixed,
-};
-
-use super::MemoryChipType;
+use std::array;
 
 /// A memory chip that can initialize or finalize values in memory.
 pub struct MemoryGlobalChip {
@@ -54,8 +61,46 @@ impl<F: PrimeField32> MachineAir<F> for MemoryGlobalChip {
         }
     }
 
-    fn generate_dependencies(&self, _input: &ExecutionRecord, _output: &mut ExecutionRecord) {
-        // Do nothing since this chip has no dependencies.
+    fn generate_dependencies(&self, input: &ExecutionRecord, output: &mut ExecutionRecord) {
+        let mut memory_events = match self.kind {
+            MemoryChipType::Initialize => input.global_memory_initialize_events.clone(),
+            MemoryChipType::Finalize => input.global_memory_finalize_events.clone(),
+        };
+
+        let is_receive = match self.kind {
+            MemoryChipType::Initialize => false,
+            MemoryChipType::Finalize => true,
+        };
+
+        memory_events.sort_by_key(|event| event.addr);
+        let chunk_size = std::cmp::max(memory_events.len() / num_cpus::get(), 1);
+
+        let blu_batches = memory_events
+            .par_chunks(chunk_size)
+            .map(|events| {
+                let mut blu: HashMap<u32, HashMap<ByteLookupEvent, usize>> = HashMap::new();
+                events.iter().for_each(|event| {
+                    let MemoryInitializeFinalizeEvent {
+                        addr: _addr,
+                        value,
+                        shard,
+                        timestamp: _timestamp,
+                        used,
+                    } = event.to_owned();
+                    let interaction_shard = if is_receive { shard } else { 0 };
+                    let mut row = [F::zero(); NUM_MEMORY_INIT_COLS];
+                    let cols: &mut MemoryInitCols<F> = row.as_mut_slice().borrow_mut();
+                    cols.global_interaction_cols.populate_memory_range_check_witness(
+                        interaction_shard,
+                        value,
+                        used != 0,
+                        &mut blu,
+                    );
+                });
+                blu
+            })
+            .collect::<Vec<_>>();
+        output.add_sharded_byte_lookup_events(blu_batches.iter().collect_vec());
     }
 
     fn generate_trace(
@@ -73,11 +118,19 @@ impl<F: PrimeField32> MachineAir<F> for MemoryGlobalChip {
             MemoryChipType::Finalize => input.public_values.previous_finalize_addr_bits,
         };
 
+        let is_receive = match self.kind {
+            MemoryChipType::Initialize => false,
+            MemoryChipType::Finalize => true,
+        };
+
+        let mut global_cumulative_sum = SepticDigest::<F>::zero().0;
+
         memory_events.sort_by_key(|event| event.addr);
-        let mut rows: Vec<[F; NUM_MEMORY_INIT_COLS]> = (0..memory_events.len()) // OPT: change this to par_iter
-            .map(|i| {
+        let mut rows: Vec<[F; NUM_MEMORY_INIT_COLS]> = memory_events
+            .par_iter()
+            .map(|event| {
                 let MemoryInitializeFinalizeEvent { addr, value, shard, timestamp, used } =
-                    memory_events[i];
+                    event.to_owned();
 
                 let mut row = [F::zero(); NUM_MEMORY_INIT_COLS];
                 let cols: &mut MemoryInitCols<F> = row.as_mut_slice().borrow_mut();
@@ -88,39 +141,60 @@ impl<F: PrimeField32> MachineAir<F> for MemoryGlobalChip {
                 cols.value = array::from_fn(|i| F::from_canonical_u32((value >> i) & 1));
                 cols.is_real = F::from_canonical_u32(used);
 
-                if i == 0 {
-                    let prev_addr = previous_addr_bits
-                        .iter()
-                        .enumerate()
-                        .map(|(j, bit)| bit * (1 << j))
-                        .sum::<u32>();
-                    cols.is_prev_addr_zero.populate(prev_addr);
-                    cols.is_first_comp = F::from_bool(prev_addr != 0);
-                    if prev_addr != 0 {
-                        debug_assert!(prev_addr < addr, "prev_addr {} < addr {}", prev_addr, addr);
-                        let addr_bits: [_; 32] = array::from_fn(|i| (addr >> i) & 1);
-                        cols.lt_cols.populate(&previous_addr_bits, &addr_bits);
-                    }
-                }
+                let interaction_shard = if is_receive { shard } else { 0 };
+                let interaction_clk = if is_receive { timestamp } else { 0 };
+
+                cols.global_interaction_cols.populate_memory(
+                    interaction_shard,
+                    interaction_clk,
+                    addr,
+                    value,
+                    is_receive,
+                    used != 0,
+                );
 
-                if i != 0 {
-                    let prev_is_real = memory_events[i - 1].used;
-                    cols.is_next_comp = F::from_canonical_u32(prev_is_real);
-                    let previous_addr = memory_events[i - 1].addr;
-                    assert_ne!(previous_addr, addr);
+                row
+            })
+            .collect::<Vec<_>>();
 
+        for i in 0..memory_events.len() {
+            let addr = memory_events[i].addr;
+            let cols: &mut MemoryInitCols<F> = rows[i].as_mut_slice().borrow_mut();
+            if i == 0 {
+                let prev_addr = previous_addr_bits
+                    .iter()
+                    .enumerate()
+                    .map(|(j, bit)| bit * (1 << j))
+                    .sum::<u32>();
+                cols.is_prev_addr_zero.populate(prev_addr);
+                cols.is_first_comp = F::from_bool(prev_addr != 0);
+                if prev_addr != 0 {
+                    debug_assert!(prev_addr < addr, "prev_addr {} < addr {}", prev_addr, addr);
                     let addr_bits: [_; 32] = array::from_fn(|i| (addr >> i) & 1);
-                    let prev_addr_bits: [_; 32] = array::from_fn(|i| (previous_addr >> i) & 1);
-                    cols.lt_cols.populate(&prev_addr_bits, &addr_bits);
+                    cols.lt_cols.populate(&previous_addr_bits, &addr_bits);
                 }
+            }
+            if i != 0 {
+                let prev_is_real = memory_events[i - 1].used;
+                cols.is_next_comp = F::from_canonical_u32(prev_is_real);
+                let previous_addr = memory_events[i - 1].addr;
+                assert_ne!(previous_addr, addr);
+
+                let addr_bits: [_; 32] = array::from_fn(|i| (addr >> i) & 1);
+                let prev_addr_bits: [_; 32] = array::from_fn(|i| (previous_addr >> i) & 1);
+                cols.lt_cols.populate(&prev_addr_bits, &addr_bits);
+            }
 
-                if i == memory_events.len() - 1 {
-                    cols.is_last_addr = F::one();
-                }
+            if i == memory_events.len() - 1 {
+                cols.is_last_addr = F::one();
+            }
 
-                row
-            })
-            .collect::<Vec<_>>();
+            cols.global_accumulation_cols.populate(
+                &mut global_cumulative_sum,
+                [cols.global_interaction_cols],
+                [cols.is_real],
+            );
+        }
 
         // Pad the trace to a power of two depending on the proof shape in `input`.
         pad_rows_fixed(
@@ -129,7 +203,23 @@ impl<F: PrimeField32> MachineAir<F> for MemoryGlobalChip {
             input.fixed_log2_rows::<F, Self>(self),
         );
 
-        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_MEMORY_INIT_COLS)
+        let mut trace = RowMajorMatrix::new(
+            rows.into_iter().flatten().collect::<Vec<_>>(),
+            NUM_MEMORY_INIT_COLS,
+        );
+
+        for i in memory_events.len()..trace.height() {
+            let cols: &mut MemoryInitCols<F> =
+                trace.values[i * NUM_MEMORY_INIT_COLS..(i + 1) * NUM_MEMORY_INIT_COLS].borrow_mut();
+            cols.global_interaction_cols.populate_dummy();
+            cols.global_accumulation_cols.populate(
+                &mut global_cumulative_sum,
+                [cols.global_interaction_cols],
+                [cols.is_real],
+            );
+        }
+
+        trace
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -148,6 +238,8 @@ impl<F: PrimeField32> MachineAir<F> for MemoryGlobalChip {
     }
 }
 
+pub const MEMORY_GLOBAL_INITIAL_DIGEST_POS_COPY: usize = 161;
+
 #[derive(AlignedBorrow, Debug, Clone, Copy)]
 #[repr(C)]
 pub struct MemoryInitCols<T> {
@@ -172,6 +264,9 @@ pub struct MemoryInitCols<T> {
     /// Whether the memory access is a real access.
     pub is_real: T,
 
+    /// The columns for sending a global interaction.
+    pub global_interaction_cols: GlobalInteractionOperation<T>,
+
     /// Whether or not we are making the assertion `addr < addr_next`.
     pub is_next_comp: T,
 
@@ -183,6 +278,9 @@ pub struct MemoryInitCols<T> {
 
     /// A flag to indicate the last non-padded address. An auxiliary column needed for degree 3.
     pub is_last_addr: T,
+
+    /// The columns for accumulating the elliptic curve digests.
+    pub global_accumulation_cols: GlobalAccumulationOperation<T, 1>,
 }
 
 pub(crate) const NUM_MEMORY_INIT_COLS: usize = size_of::<MemoryInitCols<u8>>();
@@ -217,20 +315,41 @@ where
 
         if self.kind == MemoryChipType::Initialize {
             let mut values = vec![AB::Expr::zero(), AB::Expr::zero(), local.addr.into()];
-            values.extend(value.map(Into::into));
-            builder.send(
-                AirInteraction::new(values, local.is_real.into(), InteractionKind::Memory),
-                InteractionScope::Global,
+            values.extend(value.clone().map(Into::into));
+            GlobalInteractionOperation::<AB::F>::eval_single_digest_memory(
+                builder,
+                AB::Expr::zero(),
+                AB::Expr::zero(),
+                local.addr.into(),
+                value,
+                local.global_interaction_cols,
+                false,
+                local.is_real,
             );
         } else {
             let mut values = vec![local.shard.into(), local.timestamp.into(), local.addr.into()];
-            values.extend(value);
-            builder.receive(
-                AirInteraction::new(values, local.is_real.into(), InteractionKind::Memory),
-                InteractionScope::Global,
+            values.extend(value.clone());
+            GlobalInteractionOperation::<AB::F>::eval_single_digest_memory(
+                builder,
+                local.shard.into(),
+                local.timestamp.into(),
+                local.addr.into(),
+                value,
+                local.global_interaction_cols,
+                true,
+                local.is_real,
             );
         }
 
+        GlobalAccumulationOperation::<AB::F, 1>::eval_accumulation(
+            builder,
+            [local.global_interaction_cols],
+            [local.is_real],
+            [next.is_real],
+            local.global_accumulation_cols,
+            next.global_accumulation_cols,
+        );
+
         // Canonically decompose the address into bits so we can do comparisons.
         BabyBearBitDecomposition::<AB::F>::range_check(
             builder,
@@ -369,6 +488,7 @@ mod tests {
     };
     use p3_baby_bear::BabyBear;
     use sp1_core_executor::{programs::tests::simple_program, Executor};
+    use sp1_stark::InteractionKind;
     use sp1_stark::{
         baby_bear_poseidon2::BabyBearPoseidon2, debug_interactions_with_all_chips, SP1CoreOpts,
         StarkMachine,
diff --git a/crates/core/machine/src/memory/local.rs b/crates/core/machine/src/memory/local.rs
index 8be4377031..ba8043ff27 100644
--- a/crates/core/machine/src/memory/local.rs
+++ b/crates/core/machine/src/memory/local.rs
@@ -1,27 +1,58 @@
 use std::{
     borrow::{Borrow, BorrowMut},
-    mem::size_of,
+    mem::{size_of, transmute},
 };
 
-use crate::utils::{next_power_of_two, zeroed_f_vec};
+use crate::utils::{indices_arr, next_power_of_two, zeroed_f_vec};
+use crate::{operations::GlobalAccumulationOperation, operations::GlobalInteractionOperation};
+use hashbrown::HashMap;
+use itertools::Itertools;
 use p3_air::{Air, BaseAir};
 use p3_field::PrimeField32;
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
-use p3_maybe_rayon::prelude::{ParallelBridge, ParallelIterator};
+use p3_maybe_rayon::prelude::IndexedParallelIterator;
+use p3_maybe_rayon::prelude::IntoParallelIterator;
+use p3_maybe_rayon::prelude::IntoParallelRefMutIterator;
+use p3_maybe_rayon::prelude::{ParallelBridge, ParallelIterator, ParallelSlice};
+use rayon_scan::ScanParallelIterator;
+use sp1_core_executor::events::ByteLookupEvent;
+use sp1_core_executor::events::ByteRecord;
 use sp1_core_executor::{ExecutionRecord, Program};
 use sp1_derive::AlignedBorrow;
 use sp1_stark::{
     air::{AirInteraction, InteractionScope, MachineAir, SP1AirBuilder},
+    septic_curve::SepticCurve,
+    septic_curve::SepticCurveComplete,
+    septic_digest::SepticDigest,
+    septic_extension::SepticExtension,
     InteractionKind, Word,
 };
 
+/// Creates the `MemoryLocalCols` column map by transmuting an array of column indices.
+const fn make_col_map() -> MemoryLocalCols<usize> {
+    let indices_arr = indices_arr::<NUM_MEMORY_LOCAL_INIT_COLS>();
+    unsafe { transmute::<[usize; NUM_MEMORY_LOCAL_INIT_COLS], MemoryLocalCols<usize>>(indices_arr) }
+}
+
+const MEMORY_LOCAL_COL_MAP: MemoryLocalCols<usize> = make_col_map();
+/// Column-map entry for the first element of the initial digest in the accumulation columns.
+pub const MEMORY_LOCAL_INITIAL_DIGEST_POS: usize =
+    MEMORY_LOCAL_COL_MAP.global_accumulation_cols.initial_digest[0].0[0];
+/// Hard-coded copy of `MEMORY_LOCAL_INITIAL_DIGEST_POS`; `BaseAir::width` asserts they match.
+pub const MEMORY_LOCAL_INITIAL_DIGEST_POS_COPY: usize = 480;
+/// NOTE(review): purpose not evident here; looks like a placeholder sized by the copy — confirm.
+#[repr(C)]
+pub struct Ghost {
+    pub v: [usize; MEMORY_LOCAL_INITIAL_DIGEST_POS_COPY],
+}
+
 pub const NUM_LOCAL_MEMORY_ENTRIES_PER_ROW: usize = 4;
 
 pub(crate) const NUM_MEMORY_LOCAL_INIT_COLS: usize = size_of::<MemoryLocalCols<u8>>();
 
 #[derive(AlignedBorrow, Debug, Clone, Copy)]
 #[repr(C)]
-struct SingleMemoryLocal<T> {
+pub struct SingleMemoryLocal<T> {
     /// The address of the memory access.
     pub addr: T,
 
@@ -43,6 +74,12 @@ struct SingleMemoryLocal<T> {
     /// The final value of the memory access.
     pub final_value: Word<T>,
 
+    /// The global interaction columns for initial access.
+    pub initial_global_interaction_cols: GlobalInteractionOperation<T>,
+
+    /// The global interaction columns for final access.
+    pub final_global_interaction_cols: GlobalInteractionOperation<T>,
+
     /// Whether the memory access is a real access.
     pub is_real: T,
 }
@@ -51,6 +88,7 @@ struct SingleMemoryLocal<T> {
 #[repr(C)]
 pub struct MemoryLocalCols<T> {
     memory_local_entries: [SingleMemoryLocal<T>; NUM_LOCAL_MEMORY_ENTRIES_PER_ROW],
+    pub global_accumulation_cols: GlobalAccumulationOperation<T, 8>,
 }
 
 pub struct MemoryLocalChip {}
@@ -64,6 +102,7 @@ impl MemoryLocalChip {
 
 impl<F> BaseAir<F> for MemoryLocalChip {
     fn width(&self) -> usize {
+        assert_eq!(MEMORY_LOCAL_INITIAL_DIGEST_POS_COPY, MEMORY_LOCAL_INITIAL_DIGEST_POS);
         NUM_MEMORY_LOCAL_INIT_COLS
     }
 }
@@ -77,8 +116,43 @@ impl<F: PrimeField32> MachineAir<F> for MemoryLocalChip {
         "MemoryLocal".to_string()
     }
 
-    fn generate_dependencies(&self, _input: &ExecutionRecord, _output: &mut ExecutionRecord) {
-        // Do nothing since this chip has no dependencies.
+    /// Collects the range-check byte lookups of all local memory events into `output`.
+    fn generate_dependencies(&self, input: &ExecutionRecord, output: &mut ExecutionRecord) {
+        let events = input.get_local_mem_events().collect::<Vec<_>>();
+        // Ceiling division by the row capacity: a partially filled last row is still a row.
+        let nb_rows = (events.len() + NUM_LOCAL_MEMORY_ENTRIES_PER_ROW - 1)
+            / NUM_LOCAL_MEMORY_ENTRIES_PER_ROW;
+        let chunk_size = std::cmp::max((nb_rows + 1) / num_cpus::get(), 1);
+
+        let blu_batches = events
+            .par_chunks(chunk_size * NUM_LOCAL_MEMORY_ENTRIES_PER_ROW)
+            .map(|batch| {
+                let mut blu: HashMap<u32, HashMap<ByteLookupEvent, usize>> = HashMap::new();
+                for row_events in batch.chunks(NUM_LOCAL_MEMORY_ENTRIES_PER_ROW) {
+                    // Scratch row: only a target for the witness population, never stored.
+                    let mut row = [F::zero(); NUM_MEMORY_LOCAL_INIT_COLS];
+                    let cols: &mut MemoryLocalCols<F> = row.as_mut_slice().borrow_mut();
+                    // `zip` stops at the shorter side, skipping unused entries of a partial row.
+                    for (entry, event) in cols.memory_local_entries.iter_mut().zip(row_events) {
+                        entry.initial_global_interaction_cols.populate_memory_range_check_witness(
+                            event.initial_mem_access.shard,
+                            event.initial_mem_access.value,
+                            true,
+                            &mut blu,
+                        );
+                        entry.final_global_interaction_cols.populate_memory_range_check_witness(
+                            event.final_mem_access.shard,
+                            event.final_mem_access.value,
+                            true,
+                            &mut blu,
+                        );
+                    }
+                }
+                blu
+            })
+            .collect::<Vec<_>>();
+
+        output.add_sharded_byte_lookup_events(blu_batches.iter().collect_vec());
+    }
 
     fn generate_trace(
@@ -92,13 +166,21 @@ impl<F: PrimeField32> MachineAir<F> for MemoryLocalChip {
         let size_log2 = input.fixed_log2_rows::<F, _>(self);
         let padded_nb_rows = next_power_of_two(nb_rows, size_log2);
         let mut values = zeroed_f_vec(padded_nb_rows * NUM_MEMORY_LOCAL_INIT_COLS);
-        let chunk_size = std::cmp::max((nb_rows + 1) / num_cpus::get(), 1);
+        let chunk_size = std::cmp::max(nb_rows / num_cpus::get(), 0) + 1;
 
-        values
+        let mut chunks = values[..nb_rows * NUM_MEMORY_LOCAL_INIT_COLS]
             .chunks_mut(chunk_size * NUM_MEMORY_LOCAL_INIT_COLS)
+            .collect::<Vec<_>>();
+
+        let point_chunks = chunks
+            .par_iter_mut()
             .enumerate()
-            .par_bridge()
-            .for_each(|(i, rows)| {
+            .map(|(i, rows)| {
+                let mut point_chunks =
+                    Vec::with_capacity(chunk_size * NUM_LOCAL_MEMORY_ENTRIES_PER_ROW * 2 + 1);
+                if i == 0 {
+                    point_chunks.push(SepticCurveComplete::Affine(SepticDigest::<F>::zero().0));
+                }
                 rows.chunks_mut(NUM_MEMORY_LOCAL_INIT_COLS).enumerate().for_each(|(j, row)| {
                     let idx = (i * chunk_size + j) * NUM_LOCAL_MEMORY_ENTRIES_PER_ROW;
 
@@ -118,9 +200,101 @@ impl<F: PrimeField32> MachineAir<F> for MemoryLocalChip {
                             cols.initial_value = event.initial_mem_access.value.into();
                             cols.final_value = event.final_mem_access.value.into();
                             cols.is_real = F::one();
+                            cols.initial_global_interaction_cols.populate_memory(
+                                event.initial_mem_access.shard,
+                                event.initial_mem_access.timestamp,
+                                event.addr,
+                                event.initial_mem_access.value,
+                                true,
+                                true,
+                            );
+                            point_chunks.push(SepticCurveComplete::Affine(SepticCurve {
+                                x: SepticExtension(
+                                    cols.initial_global_interaction_cols.x_coordinate.0,
+                                ),
+                                y: SepticExtension(
+                                    cols.initial_global_interaction_cols.y_coordinate.0,
+                                ),
+                            }));
+                            cols.final_global_interaction_cols.populate_memory(
+                                event.final_mem_access.shard,
+                                event.final_mem_access.timestamp,
+                                event.addr,
+                                event.final_mem_access.value,
+                                false,
+                                true,
+                            );
+                            point_chunks.push(SepticCurveComplete::Affine(SepticCurve {
+                                x: SepticExtension(
+                                    cols.final_global_interaction_cols.x_coordinate.0,
+                                ),
+                                y: SepticExtension(
+                                    cols.final_global_interaction_cols.y_coordinate.0,
+                                ),
+                            }));
+                        } else {
+                            cols.initial_global_interaction_cols.populate_dummy();
+                            cols.final_global_interaction_cols.populate_dummy();
                         }
                     }
                 });
+                point_chunks
+            })
+            .collect::<Vec<_>>();
+
+        let mut points = Vec::with_capacity(1 + events.len() * 2);
+        for mut point_chunk in point_chunks {
+            points.append(&mut point_chunk);
+        }
+
+        if events.is_empty() {
+            points = vec![SepticCurveComplete::Affine(SepticDigest::<F>::zero().0)];
+        }
+
+        let cumulative_sum = points
+            .into_par_iter()
+            .with_min_len(1 << 15)
+            .scan(|a, b| *a + *b, SepticCurveComplete::Infinity)
+            .collect::<Vec<SepticCurveComplete<F>>>();
+
+        let final_digest = cumulative_sum.last().unwrap().point();
+        let dummy = SepticCurve::<F>::dummy();
+        let final_sum_checker = SepticCurve::<F>::sum_checker_x(final_digest, dummy, final_digest);
+
+        let chunk_size = std::cmp::max(padded_nb_rows / num_cpus::get(), 0) + 1;
+        values
+            .chunks_mut(chunk_size * NUM_MEMORY_LOCAL_INIT_COLS)
+            .enumerate()
+            .par_bridge()
+            .for_each(|(i, rows)| {
+                rows.chunks_mut(NUM_MEMORY_LOCAL_INIT_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = i * chunk_size + j;
+
+                    let cols: &mut MemoryLocalCols<F> = row.borrow_mut();
+                    if idx < nb_rows {
+                        let start = NUM_LOCAL_MEMORY_ENTRIES_PER_ROW * 2 * idx;
+                        let end = std::cmp::min(
+                            NUM_LOCAL_MEMORY_ENTRIES_PER_ROW * 2 * (idx + 1) + 1,
+                            cumulative_sum.len(),
+                        );
+                        cols.global_accumulation_cols.populate_real(
+                            &cumulative_sum[start..end],
+                            final_digest,
+                            final_sum_checker,
+                        );
+                    } else {
+                        for k in 0..NUM_LOCAL_MEMORY_ENTRIES_PER_ROW {
+                            cols.memory_local_entries[k]
+                                .initial_global_interaction_cols
+                                .populate_dummy();
+                            cols.memory_local_entries[k]
+                                .final_global_interaction_cols
+                                .populate_dummy();
+                        }
+                        cols.global_accumulation_cols
+                            .populate_dummy(final_digest, final_sum_checker);
+                    }
+                })
             });
 
         // Convert the trace to a row major matrix.
@@ -148,6 +322,12 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &MemoryLocalCols<AB::Var> = (*local).borrow();
+        let next = main.row_slice(1);
+        let next: &MemoryLocalCols<AB::Var> = (*next).borrow();
+
+        let mut global_interaction_cols = Vec::with_capacity(8);
+        let mut local_is_reals = Vec::with_capacity(8);
+        let mut next_is_reals = Vec::with_capacity(8);
 
         for local in local.memory_local_entries.iter() {
             builder.assert_eq(
@@ -155,39 +335,77 @@ where
                 local.is_real * local.is_real * local.is_real,
             );
 
-            for scope in [InteractionScope::Global, InteractionScope::Local] {
-                let mut values =
-                    vec![local.initial_shard.into(), local.initial_clk.into(), local.addr.into()];
-                values.extend(local.initial_value.map(Into::into));
-                builder.receive(
-                    AirInteraction::new(
-                        values.clone(),
-                        local.is_real.into(),
-                        InteractionKind::Memory,
-                    ),
-                    scope,
-                );
-
-                let mut values =
-                    vec![local.final_shard.into(), local.final_clk.into(), local.addr.into()];
-                values.extend(local.final_value.map(Into::into));
-                builder.send(
-                    AirInteraction::new(
-                        values.clone(),
-                        local.is_real.into(),
-                        InteractionKind::Memory,
-                    ),
-                    scope,
-                );
-            }
+            let mut values =
+                vec![local.initial_shard.into(), local.initial_clk.into(), local.addr.into()];
+            values.extend(local.initial_value.map(Into::into));
+            builder.receive(
+                AirInteraction::new(values.clone(), local.is_real.into(), InteractionKind::Memory),
+                InteractionScope::Local,
+            );
+
+            GlobalInteractionOperation::<AB::F>::eval_single_digest_memory(
+                builder,
+                local.initial_shard.into(),
+                local.initial_clk.into(),
+                local.addr.into(),
+                local.initial_value.map(Into::into).0,
+                local.initial_global_interaction_cols,
+                true,
+                local.is_real,
+            );
+
+            global_interaction_cols.push(local.initial_global_interaction_cols);
+            local_is_reals.push(local.is_real);
+
+            let mut values =
+                vec![local.final_shard.into(), local.final_clk.into(), local.addr.into()];
+            values.extend(local.final_value.map(Into::into));
+            builder.send(
+                AirInteraction::new(values.clone(), local.is_real.into(), InteractionKind::Memory),
+                InteractionScope::Local,
+            );
+
+            GlobalInteractionOperation::<AB::F>::eval_single_digest_memory(
+                builder,
+                local.final_shard.into(),
+                local.final_clk.into(),
+                local.addr.into(),
+                local.final_value.map(Into::into).0,
+                local.final_global_interaction_cols,
+                false,
+                local.is_real,
+            );
+
+            global_interaction_cols.push(local.final_global_interaction_cols);
+            local_is_reals.push(local.is_real);
+        }
+
+        for next in next.memory_local_entries.iter() {
+            next_is_reals.push(next.is_real);
+            next_is_reals.push(next.is_real);
         }
+
+        GlobalAccumulationOperation::<AB::F, 8>::eval_accumulation(
+            builder,
+            global_interaction_cols
+                .try_into()
+                .unwrap_or_else(|_| panic!("There should be 8 interactions")),
+            local_is_reals.try_into().unwrap_or_else(|_| panic!("There should be 8 interactions")),
+            next_is_reals.try_into().unwrap_or_else(|_| panic!("There should be 8 interactions")),
+            local.global_accumulation_cols,
+            next.global_accumulation_cols,
+        );
     }
 }
 
 #[cfg(test)]
 mod tests {
+    use super::*;
     use p3_baby_bear::BabyBear;
     use p3_matrix::dense::RowMajorMatrix;
+    use rand::thread_rng;
+    use rand::Rng;
+    use sp1_core_executor::events::{MemoryLocalEvent, MemoryRecord};
     use sp1_core_executor::{programs::tests::simple_program, ExecutionRecord, Executor};
     use sp1_stark::{
         air::{InteractionScope, MachineAir},
@@ -280,4 +498,167 @@ mod tests {
             InteractionScope::Global,
         );
     }
+
+    /// Builds an execution record containing 256 randomly generated local memory events.
+    #[cfg(feature = "sys")]
+    fn get_test_execution_record() -> ExecutionRecord {
+        let mut rng = thread_rng();
+        let random_event = || {
+            let addr = rng.gen_range(0..BabyBear::ORDER_U32);
+            let initial_value = rng.gen_range(0..u32::MAX);
+            let initial_shard = rng.gen_range(0..(1u32 << 16));
+            let initial_timestamp = rng.gen_range(0..(1u32 << 24));
+            let last_value = rng.gen_range(0..u32::MAX);
+            let last_timestamp = rng.gen_range(0..(1u32 << 24));
+            let last_shard = rng.gen_range(0..(1u32 << 16));
+            MemoryLocalEvent {
+                addr,
+                initial_mem_access: MemoryRecord {
+                    shard: initial_shard,
+                    timestamp: initial_timestamp,
+                    value: initial_value,
+                },
+                final_mem_access: MemoryRecord {
+                    shard: last_shard,
+                    timestamp: last_timestamp,
+                    value: last_value,
+                },
+            }
+        };
+        let cpu_local_memory_access =
+            std::iter::repeat_with(random_event).take(256).collect::<Vec<_>>();
+        ExecutionRecord { cpu_local_memory_access, ..Default::default() }
+    }
+
+    /// Checks that the FFI-backed trace generation produces the same trace as the chip itself.
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        let input = get_test_execution_record();
+        let mut scratch_output = ExecutionRecord::default();
+        let expected: RowMajorMatrix<BabyBear> =
+            MemoryLocalChip::new().generate_trace(&input, &mut scratch_output);
+        let actual = generate_trace_ffi(&input, expected.height());
+        assert_eq!(actual, expected);
+    }
+
+    #[cfg(feature = "sys")]
+    fn generate_trace_ffi(input: &ExecutionRecord, height: usize) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        // First pass: fill the real rows via the `sys` FFI and collect each entry's curve points.
+        let events = input.get_local_mem_events().collect::<Vec<_>>();
+        let nb_rows = (events.len() + 3) / 4;
+        let padded_nb_rows = height;
+        let mut values = zeroed_f_vec(padded_nb_rows * NUM_MEMORY_LOCAL_INIT_COLS);
+        let chunk_size = std::cmp::max(nb_rows / num_cpus::get(), 0) + 1;
+
+        let mut chunks = values[..nb_rows * NUM_MEMORY_LOCAL_INIT_COLS]
+            .chunks_mut(chunk_size * NUM_MEMORY_LOCAL_INIT_COLS)
+            .collect::<Vec<_>>();
+        // Chunk 0 first pushes the point of `SepticDigest::zero()`; real entries push two points.
+        let point_chunks = chunks
+            .par_iter_mut()
+            .enumerate()
+            .map(|(i, rows)| {
+                let mut point_chunks =
+                    Vec::with_capacity(chunk_size * NUM_LOCAL_MEMORY_ENTRIES_PER_ROW * 2 + 1);
+                if i == 0 {
+                    point_chunks.push(SepticCurveComplete::Affine(SepticDigest::<F>::zero().0));
+                }
+                rows.chunks_mut(NUM_MEMORY_LOCAL_INIT_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = (i * chunk_size + j) * NUM_LOCAL_MEMORY_ENTRIES_PER_ROW;
+                    let cols: &mut MemoryLocalCols<F> = row.borrow_mut();
+                    for k in 0..NUM_LOCAL_MEMORY_ENTRIES_PER_ROW {
+                        let cols = &mut cols.memory_local_entries[k];
+                        if idx + k < events.len() {
+                            unsafe {
+                                crate::sys::memory_local_event_to_row_babybear(
+                                    events[idx + k],
+                                    cols,
+                                );
+                            }
+                            point_chunks.push(SepticCurveComplete::Affine(SepticCurve {
+                                x: SepticExtension(
+                                    cols.initial_global_interaction_cols.x_coordinate.0,
+                                ),
+                                y: SepticExtension(
+                                    cols.initial_global_interaction_cols.y_coordinate.0,
+                                ),
+                            }));
+                            point_chunks.push(SepticCurveComplete::Affine(SepticCurve {
+                                x: SepticExtension(
+                                    cols.final_global_interaction_cols.x_coordinate.0,
+                                ),
+                                y: SepticExtension(
+                                    cols.final_global_interaction_cols.y_coordinate.0,
+                                ),
+                            }));
+                        } else {
+                            cols.initial_global_interaction_cols.populate_dummy();
+                            cols.final_global_interaction_cols.populate_dummy();
+                        }
+                    }
+                });
+                point_chunks
+            })
+            .collect::<Vec<_>>();
+        // Concatenate the per-chunk points in chunk order.
+        let mut points = Vec::with_capacity(1 + events.len() * 2);
+        for mut point_chunk in point_chunks {
+            points.append(&mut point_chunk);
+        }
+        // With no events there are no chunks above, so the starting point must be added here.
+        if events.is_empty() {
+            points = vec![SepticCurveComplete::Affine(SepticDigest::<F>::zero().0)];
+        }
+        // Running sums of the points, computed with a parallel scan seeded with `Infinity`.
+        let cumulative_sum = points
+            .into_par_iter()
+            .with_min_len(1 << 15)
+            .scan(|a, b| *a + *b, SepticCurveComplete::Infinity)
+            .collect::<Vec<SepticCurveComplete<F>>>();
+        // The last running sum is the digest handed to every row below.
+        let final_digest = cumulative_sum.last().unwrap().point();
+        let dummy = SepticCurve::<F>::dummy();
+        let final_sum_checker = SepticCurve::<F>::sum_checker_x(final_digest, dummy, final_digest);
+        // Second pass over all rows (padding included): fill the accumulation columns.
+        let chunk_size = std::cmp::max(padded_nb_rows / num_cpus::get(), 0) + 1;
+        values
+            .chunks_mut(chunk_size * NUM_MEMORY_LOCAL_INIT_COLS)
+            .enumerate()
+            .par_bridge()
+            .for_each(|(i, rows)| {
+                rows.chunks_mut(NUM_MEMORY_LOCAL_INIT_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = i * chunk_size + j;
+                    // Rows with `idx >= nb_rows` are padding rows.
+                    let cols: &mut MemoryLocalCols<F> = row.borrow_mut();
+                    if idx < nb_rows {
+                        let start = NUM_LOCAL_MEMORY_ENTRIES_PER_ROW * 2 * idx;
+                        let end = std::cmp::min(
+                            NUM_LOCAL_MEMORY_ENTRIES_PER_ROW * 2 * (idx + 1) + 1,
+                            cumulative_sum.len(),
+                        );
+                        cols.global_accumulation_cols.populate_real(
+                            &cumulative_sum[start..end],
+                            final_digest,
+                            final_sum_checker,
+                        );
+                    } else {
+                        for k in 0..NUM_LOCAL_MEMORY_ENTRIES_PER_ROW {
+                            cols.memory_local_entries[k]
+                                .initial_global_interaction_cols
+                                .populate_dummy();
+                            cols.memory_local_entries[k]
+                                .final_global_interaction_cols
+                                .populate_dummy();
+                        }
+                        cols.global_accumulation_cols
+                            .populate_dummy(final_digest, final_sum_checker);
+                    }
+                })
+            });
+
+        // Convert the trace to a row major matrix.
+        RowMajorMatrix::new(values, NUM_MEMORY_LOCAL_INIT_COLS)
+    }
 }
diff --git a/crates/core/machine/src/memory/program.rs b/crates/core/machine/src/memory/program.rs
index 4dc8b196ad..3777330842 100644
--- a/crates/core/machine/src/memory/program.rs
+++ b/crates/core/machine/src/memory/program.rs
@@ -4,18 +4,21 @@ use core::{
 };
 use itertools::Itertools;
 use p3_air::{Air, AirBuilder, AirBuilderWithPublicValues, BaseAir, PairBuilder};
-use p3_field::{AbstractField, PrimeField};
+use p3_field::AbstractField;
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 
+use crate::{operations::GlobalAccumulationOperation, operations::GlobalInteractionOperation};
+use hashbrown::HashMap;
+use p3_field::PrimeField32;
 use p3_maybe_rayon::prelude::{ParallelBridge, ParallelIterator};
+use sp1_core_executor::events::ByteLookupEvent;
+use sp1_core_executor::events::ByteRecord;
 use sp1_core_executor::{ExecutionRecord, Program};
 use sp1_derive::AlignedBorrow;
 use sp1_stark::{
-    air::{
-        AirInteraction, InteractionScope, MachineAir, PublicValues, SP1AirBuilder,
-        SP1_PROOF_NUM_PV_ELTS,
-    },
-    InteractionKind, Word,
+    air::{InteractionScope, MachineAir, PublicValues, SP1AirBuilder, SP1_PROOF_NUM_PV_ELTS},
+    septic_digest::SepticDigest,
+    Word,
 };
 
 use crate::{
@@ -47,6 +50,12 @@ pub struct MemoryProgramMultCols<T> {
 
     /// Whether the shard is the first shard.
     pub is_first_shard: IsZeroOperation<T>,
+
+    /// The columns for the global interaction.
+    pub global_interaction_cols: GlobalInteractionOperation<T>,
+
+    /// The columns for accumulating the elliptic curve digests.
+    pub global_accumulation_cols: GlobalAccumulationOperation<T, 1>,
 }
 
 /// Chip that initializes memory that is provided from the program. The table is preprocessed and
@@ -61,7 +70,7 @@ impl MemoryProgramChip {
     }
 }
 
-impl<F: PrimeField> MachineAir<F> for MemoryProgramChip {
+impl<F: PrimeField32> MachineAir<F> for MemoryProgramChip {
     type Record = ExecutionRecord;
 
     type Program = Program;
@@ -107,8 +116,21 @@ impl<F: PrimeField> MachineAir<F> for MemoryProgramChip {
         Some(RowMajorMatrix::new(values, NUM_MEMORY_PROGRAM_PREPROCESSED_COLS))
     }
 
-    fn generate_dependencies(&self, _input: &ExecutionRecord, _output: &mut ExecutionRecord) {
-        // Do nothing since this chip has no dependencies.
+    /// Records the range-check byte lookups for every word of the program memory image.
+    fn generate_dependencies(&self, input: &ExecutionRecord, output: &mut ExecutionRecord) {
+        let is_first_shard = input.public_values.shard == 1;
+        let mut byte_lookups: HashMap<u32, HashMap<ByteLookupEvent, usize>> = HashMap::new();
+
+        // Only the stored words matter here; the addresses are not part of the witness call.
+        for &word in input.program.memory_image.values() {
+            // Scratch row: only a target for the witness population, never stored.
+            let mut scratch = [F::zero(); NUM_MEMORY_PROGRAM_MULT_COLS];
+            let cols: &mut MemoryProgramMultCols<F> = scratch.as_mut_slice().borrow_mut();
+            cols.global_interaction_cols
+                .populate_memory_range_check_witness(0, word, is_first_shard, &mut byte_lookups);
+        }
+
+        output.add_sharded_byte_lookup_events(vec![&byte_lookups]);
+    }
 
     fn generate_trace(
@@ -116,18 +138,26 @@ impl<F: PrimeField> MachineAir<F> for MemoryProgramChip {
         input: &ExecutionRecord,
         _output: &mut ExecutionRecord,
     ) -> RowMajorMatrix<F> {
-        let program_memory_addrs = input.program.memory_image.keys().copied().sorted();
+        let program_memory = &input.program.memory_image;
 
-        let mult = if input.public_values.shard == 1 { F::one() } else { F::zero() };
+        let mult_bool = input.public_values.shard == 1;
+        let mult = F::from_bool(mult_bool);
 
+        let mut global_cumulative_sum = SepticDigest::<F>::zero().0;
         // Generate the trace rows for each event.
-        let mut rows = program_memory_addrs
-            .into_iter()
-            .map(|_| {
+        let mut rows = program_memory
+            .iter()
+            .map(|(&addr, &word)| {
                 let mut row = [F::zero(); NUM_MEMORY_PROGRAM_MULT_COLS];
                 let cols: &mut MemoryProgramMultCols<F> = row.as_mut_slice().borrow_mut();
                 cols.multiplicity = mult;
                 cols.is_first_shard.populate(input.public_values.shard - 1);
+                cols.global_interaction_cols.populate_memory(0, 0, addr, word, false, mult_bool);
+                cols.global_accumulation_cols.populate(
+                    &mut global_cumulative_sum,
+                    [cols.global_interaction_cols],
+                    [cols.multiplicity],
+                );
                 row
             })
             .collect::<Vec<_>>();
@@ -140,15 +170,29 @@ impl<F: PrimeField> MachineAir<F> for MemoryProgramChip {
         );
 
         // Convert the trace to a row major matrix.
-
-        RowMajorMatrix::new(
+        let mut trace = RowMajorMatrix::new(
             rows.into_iter().flatten().collect::<Vec<_>>(),
             NUM_MEMORY_PROGRAM_MULT_COLS,
-        )
+        );
+
+        let len = input.program.memory_image.len();
+        for i in len..trace.height() {
+            let cols: &mut MemoryProgramMultCols<F> = trace.values
+                [i * NUM_MEMORY_PROGRAM_MULT_COLS..(i + 1) * NUM_MEMORY_PROGRAM_MULT_COLS]
+                .borrow_mut();
+            cols.global_interaction_cols.populate_dummy();
+            cols.global_accumulation_cols.populate(
+                &mut global_cumulative_sum,
+                [cols.global_interaction_cols],
+                [cols.multiplicity],
+            );
+        }
+
+        trace
     }
 
     fn included(&self, _: &Self::Record) -> bool {
-        true
+        false
     }
 
     fn commit_scope(&self) -> InteractionScope {
@@ -176,6 +220,9 @@ where
         let mult_local = main.row_slice(0);
         let mult_local: &MemoryProgramMultCols<AB::Var> = (*mult_local).borrow();
 
+        let mult_next = main.row_slice(1);
+        let mult_next: &MemoryProgramMultCols<AB::Var> = (*mult_next).borrow();
+
         // Get shard from public values and evaluate whether it is the first shard.
         let public_values_slice: [AB::Expr; SP1_PROOF_NUM_PV_ELTS] =
             core::array::from_fn(|i| builder.public_values()[i].into());
@@ -203,9 +250,24 @@ where
 
         let mut values = vec![AB::Expr::zero(), AB::Expr::zero(), prep_local.addr.into()];
         values.extend(prep_local.value.map(Into::into));
-        builder.send(
-            AirInteraction::new(values, mult_local.multiplicity.into(), InteractionKind::Memory),
-            InteractionScope::Global,
+        GlobalInteractionOperation::<AB::F>::eval_single_digest_memory(
+            builder,
+            AB::Expr::zero(),
+            AB::Expr::zero(),
+            prep_local.addr.into(),
+            prep_local.value.map(Into::into).0,
+            mult_local.global_interaction_cols,
+            false,
+            mult_local.multiplicity,
+        );
+
+        GlobalAccumulationOperation::<AB::F, 1>::eval_accumulation(
+            builder,
+            [mult_local.global_interaction_cols],
+            [mult_local.multiplicity],
+            [mult_next.multiplicity],
+            mult_local.global_accumulation_cols,
+            mult_next.global_accumulation_cols,
         );
     }
 }
diff --git a/crates/core/machine/src/operations/field/field_den.rs b/crates/core/machine/src/operations/field/field_den.rs
index b9bb80b306..10762c6574 100644
--- a/crates/core/machine/src/operations/field/field_den.rs
+++ b/crates/core/machine/src/operations/field/field_den.rs
@@ -153,9 +153,10 @@ mod tests {
         StarkGenericConfig,
     };
 
+    use crate::utils::uni_stark::{uni_stark_prove, uni_stark_verify};
+
     use super::{FieldDenCols, Limbs};
 
-    use crate::utils::{uni_stark_prove as prove, uni_stark_verify as verify};
     use core::{
         borrow::{Borrow, BorrowMut},
         mem::size_of,
@@ -287,9 +288,9 @@ mod tests {
         // This it to test that the proof DOESN'T work if messed up.
         // let row = trace.row_mut(0);
         // row[0] = BabyBear::from_canonical_u8(0);
-        let proof = prove::<BabyBearPoseidon2, _>(&config, &chip, &mut challenger, trace);
+        let proof = uni_stark_prove::<BabyBearPoseidon2, _>(&config, &chip, &mut challenger, trace);
 
         let mut challenger = config.challenger();
-        verify(&config, &chip, &mut challenger, &proof).unwrap();
+        uni_stark_verify(&config, &chip, &mut challenger, &proof).unwrap();
     }
 }
diff --git a/crates/core/machine/src/operations/field/field_inner_product.rs b/crates/core/machine/src/operations/field/field_inner_product.rs
index 30f2610e74..41221c54e9 100644
--- a/crates/core/machine/src/operations/field/field_inner_product.rs
+++ b/crates/core/machine/src/operations/field/field_inner_product.rs
@@ -143,7 +143,10 @@ mod tests {
 
     use super::{FieldInnerProductCols, Limbs};
 
-    use crate::utils::{pad_to_power_of_two, uni_stark_prove as prove, uni_stark_verify as verify};
+    use crate::utils::{
+        pad_to_power_of_two,
+        uni_stark::{uni_stark_prove, uni_stark_verify},
+    };
     use core::{
         borrow::{Borrow, BorrowMut},
         mem::size_of,
@@ -271,9 +274,9 @@ mod tests {
         let chip: FieldIpChip<Ed25519BaseField> = FieldIpChip::new();
         let trace: RowMajorMatrix<BabyBear> =
             chip.generate_trace(&shard, &mut ExecutionRecord::default());
-        let proof = prove::<BabyBearPoseidon2, _>(&config, &chip, &mut challenger, trace);
+        let proof = uni_stark_prove::<BabyBearPoseidon2, _>(&config, &chip, &mut challenger, trace);
 
         let mut challenger = config.challenger();
-        verify(&config, &chip, &mut challenger, &proof).unwrap();
+        uni_stark_verify(&config, &chip, &mut challenger, &proof).unwrap();
     }
 }
diff --git a/crates/core/machine/src/operations/field/field_op.rs b/crates/core/machine/src/operations/field/field_op.rs
index 9e16fe4db2..26d69da5a2 100644
--- a/crates/core/machine/src/operations/field/field_op.rs
+++ b/crates/core/machine/src/operations/field/field_op.rs
@@ -390,7 +390,8 @@ mod tests {
 
     use super::{FieldOpCols, FieldOperation, Limbs};
 
-    use crate::utils::{pad_to_power_of_two, uni_stark_prove as prove, uni_stark_verify as verify};
+    use crate::utils::pad_to_power_of_two;
+    use crate::utils::uni_stark::{uni_stark_prove, uni_stark_verify};
     use core::borrow::{Borrow, BorrowMut};
     use num::bigint::RandBigInt;
     use p3_air::Air;
@@ -535,10 +536,11 @@ mod tests {
             let shard = ExecutionRecord::default();
             let trace: RowMajorMatrix<BabyBear> =
                 chip.generate_trace(&shard, &mut ExecutionRecord::default());
-            let proof = prove::<BabyBearPoseidon2, _>(&config, &chip, &mut challenger, trace);
+            let proof =
+                uni_stark_prove::<BabyBearPoseidon2, _>(&config, &chip, &mut challenger, trace);
 
             let mut challenger = config.challenger();
-            verify(&config, &chip, &mut challenger, &proof).unwrap();
+            uni_stark_verify(&config, &chip, &mut challenger, &proof).unwrap();
         }
     }
 }
diff --git a/crates/core/machine/src/operations/field/field_sqrt.rs b/crates/core/machine/src/operations/field/field_sqrt.rs
index a0f40c6a48..238d6f07a6 100644
--- a/crates/core/machine/src/operations/field/field_sqrt.rs
+++ b/crates/core/machine/src/operations/field/field_sqrt.rs
@@ -152,7 +152,10 @@ mod tests {
     use sp1_curves::params::{FieldParameters, Limbs};
     use sp1_stark::air::{MachineAir, SP1AirBuilder};
 
-    use crate::utils::{pad_to_power_of_two, uni_stark_prove as prove, uni_stark_verify as verify};
+    use crate::utils::{
+        pad_to_power_of_two,
+        uni_stark::{uni_stark_prove, uni_stark_verify},
+    };
     use core::{
         borrow::{Borrow, BorrowMut},
         mem::size_of,
@@ -224,7 +227,11 @@ mod tests {
                     let mut row = [F::zero(); NUM_TEST_COLS];
                     let cols: &mut TestCols<F, P> = row.as_mut_slice().borrow_mut();
                     cols.a = P::to_limbs_field::<F, _>(a);
-                    cols.sqrt.populate(&mut blu_events, 1, a, ed25519_sqrt);
+                    cols.sqrt.populate(&mut blu_events, 1, a, |p| {
+                        ed25519_sqrt(p).expect(
+                            "By now we should have validated the sqrt exists, this is a bug",
+                        )
+                    });
                     output.add_byte_lookup_events(blu_events);
                     row
                 })
@@ -283,9 +290,9 @@ mod tests {
         let shard = ExecutionRecord::default();
         let trace: RowMajorMatrix<BabyBear> =
             chip.generate_trace(&shard, &mut ExecutionRecord::default());
-        let proof = prove::<BabyBearPoseidon2, _>(&config, &chip, &mut challenger, trace);
+        let proof = uni_stark_prove::<BabyBearPoseidon2, _>(&config, &chip, &mut challenger, trace);
 
         let mut challenger = config.challenger();
-        verify(&config, &chip, &mut challenger, &proof).unwrap();
+        uni_stark_verify(&config, &chip, &mut challenger, &proof).unwrap();
     }
 }
diff --git a/crates/core/machine/src/operations/global_accumulation.rs b/crates/core/machine/src/operations/global_accumulation.rs
new file mode 100644
index 0000000000..b041072e9e
--- /dev/null
+++ b/crates/core/machine/src/operations/global_accumulation.rs
@@ -0,0 +1,222 @@
+use crate::operations::GlobalInteractionOperation;
+use p3_air::AirBuilder;
+use p3_field::AbstractExtensionField;
+use p3_field::AbstractField;
+use p3_field::Field;
+use p3_field::PrimeField32;
+use sp1_derive::AlignedBorrow;
+use sp1_stark::air::BaseAirBuilder;
+use sp1_stark::air::SepticExtensionAirBuilder;
+use sp1_stark::septic_curve::SepticCurveComplete;
+use sp1_stark::{
+    air::SP1AirBuilder,
+    septic_curve::SepticCurve,
+    septic_digest::SepticDigest,
+    septic_extension::{SepticBlock, SepticExtension},
+};
+
+/// A set of columns needed to accumulate `N` global interaction curve points per row into a running digest.
+/// It is critical that this struct is at the end of the main trace, as the permutation constraints will be dependent on this fact.
+/// It is also critical that the cumulative sum is at the end of this struct, for the same reason.
+#[derive(AlignedBorrow, Debug, Clone, Copy)]
+#[repr(C)]
+pub struct GlobalAccumulationOperation<T, const N: usize> {
+    pub initial_digest: [SepticBlock<T>; 2], // `[x, y]` of the digest this row starts from.
+    pub sum_checker: [SepticBlock<T>; N], // Witnessed `sum_checker_x` value for each of the `N` slots.
+    pub cumulative_sum: [[SepticBlock<T>; 2]; N], // `[x, y]` of the running sum after each of the `N` slots.
+}
+
+impl<T: Default, const N: usize> Default for GlobalAccumulationOperation<T, N> {
+    fn default() -> Self { // Every block is `SepticBlock::<T>::default()`.
+        Self {
+            initial_digest: core::array::from_fn(|_| SepticBlock::<T>::default()),
+            sum_checker: core::array::from_fn(|_| SepticBlock::<T>::default()), // `from_fn` works for any `N`.
+            cumulative_sum: core::array::from_fn(|_| {
+                [SepticBlock::<T>::default(), SepticBlock::<T>::default()]
+            }),
+        }
+    }
+}
+
+impl<F: PrimeField32, const N: usize> GlobalAccumulationOperation<F, N> {
+    pub fn populate(
+        &mut self,
+        initial_digest: &mut SepticCurve<F>, // In: digest entering this row. Out: digest after all `N` slots.
+        global_interaction_cols: [GlobalInteractionOperation<F>; N], // Source of the point for each slot.
+        is_real: [F; N], // Each entry must be exactly 0 or 1 (asserted below).
+    ) {
+        self.initial_digest[0] = SepticBlock::from(initial_digest.x.0);
+        self.initial_digest[1] = SepticBlock::from(initial_digest.y.0);
+        // Fold each slot's point into the running digest, in order.
+        for i in 0..N {
+            let point_cur = SepticCurve {
+                x: SepticExtension(global_interaction_cols[i].x_coordinate.0),
+                y: SepticExtension(global_interaction_cols[i].y_coordinate.0),
+            };
+            assert!(is_real[i] == F::one() || is_real[i] == F::zero());
+            let sum_point = if is_real[i] == F::one() {
+                point_cur.add_incomplete(*initial_digest) // Real slot: add the point to the running sum.
+            } else {
+                *initial_digest // Non-real slot: the running sum is unchanged.
+            };
+            let sum_checker = if is_real[i] == F::one() {
+                SepticExtension::<F>::zero() // Real slot: the checker is witnessed as zero.
+            } else {
+                SepticCurve::<F>::sum_checker_x(*initial_digest, point_cur, sum_point) // Non-real: store the evaluated checker.
+            };
+            self.sum_checker[i] = SepticBlock::from(sum_checker.0);
+            self.cumulative_sum[i][0] = SepticBlock::from(sum_point.x.0);
+            self.cumulative_sum[i][1] = SepticBlock::from(sum_point.y.0);
+            *initial_digest = sum_point; // Carry the running sum to the next slot and back to the caller.
+        }
+    }
+
+    pub fn populate_dummy(
+        &mut self,
+        final_digest: SepticCurve<F>, // Written to `initial_digest` and to every `cumulative_sum` slot.
+        final_sum_checker: SepticExtension<F>, // Written to every `sum_checker` slot.
+    ) {
+        self.initial_digest[0] = SepticBlock::from(final_digest.x.0);
+        self.initial_digest[1] = SepticBlock::from(final_digest.y.0);
+        for i in 0..N { // Every slot repeats the same digest, so the sum never moves within the row.
+            self.sum_checker[i] = SepticBlock::from(final_sum_checker.0);
+            self.cumulative_sum[i][0] = SepticBlock::from(final_digest.x.0);
+            self.cumulative_sum[i][1] = SepticBlock::from(final_digest.y.0);
+        }
+    }
+
+    pub fn populate_real(
+        &mut self,
+        sums: &[SepticCurveComplete<F>], // Must be non-empty: `sums[0]` is read below and panics otherwise.
+        final_digest: SepticCurve<F>, // Used for slots that have no corresponding entry in `sums`.
+        final_sum_checker: SepticExtension<F>, // Checker value stored for those same slots.
+    ) {
+        let len = sums.len();
+        let sums = sums.iter().map(|complete_point| complete_point.point()).collect::<Vec<_>>();
+        self.initial_digest[0] = SepticBlock::from(sums[0].x.0);
+        self.initial_digest[1] = SepticBlock::from(sums[0].y.0);
+        for i in 0..N {
+            if len >= i + 2 { // `sums[i + 1]` exists: slot `i` takes it, with a zero checker.
+                self.sum_checker[i] = SepticBlock([F::zero(); 7]);
+                self.cumulative_sum[i][0] = SepticBlock::from(sums[i + 1].x.0);
+                self.cumulative_sum[i][1] = SepticBlock::from(sums[i + 1].y.0);
+            } else { // Past the end of `sums`: fill with the final digest and checker.
+                self.sum_checker[i] = SepticBlock::from(final_sum_checker.0);
+                self.cumulative_sum[i][0] = SepticBlock::from(final_digest.x.0);
+                self.cumulative_sum[i][1] = SepticBlock::from(final_digest.y.0);
+            }
+        }
+    }
+}
+
+impl<F: Field, const N: usize> GlobalAccumulationOperation<F, N> {
+    pub fn eval_accumulation<AB: SP1AirBuilder>(
+        builder: &mut AB,
+        global_interaction_cols: [GlobalInteractionOperation<AB::Var>; N],
+        local_is_real: [AB::Var; N],
+        next_is_real: [AB::Var; N],
+        local_accumulation: GlobalAccumulationOperation<AB::Var, N>,
+        next_accumulation: GlobalAccumulationOperation<AB::Var, N>,
+    ) {
+        // First, constrain the control flow regarding `is_real`.
+        // Constrain that all `is_real` values are boolean.
+        for i in 0..N {
+            builder.assert_bool(local_is_real[i]);
+        }
+
+        // Constrain that once an `is_real` value is 0, every later `is_real` in this row is also 0.
+        for i in 0..N - 1 {
+            // `is_real[i] == 0` implies `is_real[i + 1] == 0`.
+            builder.when_not(local_is_real[i]).assert_zero(local_is_real[i + 1]);
+        }
+
+        // Constrain that `is_real[N - 1] == 0` implies `next.is_real[0] == 0` on transition rows.
+        builder.when_transition().when_not(local_is_real[N - 1]).assert_zero(next_is_real[0]);
+
+        // Next, constrain the accumulation.
+        // Read this row's `initial_digest` as a curve point; it is pinned to the starting point on the first row below.
+        let initial_digest = SepticCurve::<AB::Expr> {
+            x: SepticExtension::<AB::Expr>::from_base_fn(|i| {
+                local_accumulation.initial_digest[0][i].into()
+            }),
+            y: SepticExtension::<AB::Expr>::from_base_fn(|i| {
+                local_accumulation.initial_digest[1][i].into()
+            }),
+        };
+        // Running sum after slot `idx`, read from `cumulative_sum`.
+        let ith_cumulative_sum = |idx: usize| SepticCurve::<AB::Expr> {
+            x: SepticExtension::<AB::Expr>::from_base_fn(|i| {
+                local_accumulation.cumulative_sum[idx][0].0[i].into()
+            }),
+            y: SepticExtension::<AB::Expr>::from_base_fn(|i| {
+                local_accumulation.cumulative_sum[idx][1].0[i].into()
+            }),
+        };
+        // Point added in slot `idx`, read from the interaction columns.
+        let ith_point_to_add = |idx: usize| SepticCurve::<AB::Expr> {
+            x: SepticExtension::<AB::Expr>::from_base_fn(|i| {
+                global_interaction_cols[idx].x_coordinate.0[i].into()
+            }),
+            y: SepticExtension::<AB::Expr>::from_base_fn(|i| {
+                global_interaction_cols[idx].y_coordinate.0[i].into()
+            }),
+        };
+
+        let starting_digest = SepticDigest::<AB::Expr>::zero().0;
+        // On the first row, the initial digest must equal the point inside `SepticDigest::zero()`.
+        builder.when_first_row().assert_septic_ext_eq(initial_digest.x.clone(), starting_digest.x);
+        builder.when_first_row().assert_septic_ext_eq(initial_digest.y.clone(), starting_digest.y);
+
+        // Constrain that when `is_real = 1`, addition is being carried out, and when `is_real = 0`, the sum remains the same.
+        for i in 0..N {
+            let current_sum =
+                if i == 0 { initial_digest.clone() } else { ith_cumulative_sum(i - 1) };
+            let point_to_add = ith_point_to_add(i);
+            let next_sum = ith_cumulative_sum(i);
+            // If `is_real == 1`, current_sum + point_to_add == next_sum must hold.
+            let sum_checker_x = SepticCurve::<AB::Expr>::sum_checker_x(
+                current_sum.clone(),
+                point_to_add.clone(),
+                next_sum.clone(),
+            );
+            let sum_checker_y = SepticCurve::<AB::Expr>::sum_checker_y(
+                current_sum.clone(),
+                point_to_add,
+                next_sum.clone(),
+            );
+            let witnessed_sum_checker_x = SepticExtension::<AB::Expr>::from_base_fn(|idx| {
+                local_accumulation.sum_checker[i].0[idx].into()
+            });
+            builder.assert_septic_ext_eq(sum_checker_x, witnessed_sum_checker_x.clone()); // Holds on every row, real or not.
+            builder
+                .when(local_is_real[i])
+                .assert_septic_ext_eq(witnessed_sum_checker_x, SepticExtension::<AB::Expr>::zero());
+            builder
+                .when(local_is_real[i])
+                .assert_septic_ext_eq(sum_checker_y, SepticExtension::<AB::Expr>::zero());
+
+            // If `is_real == 0`, current_sum == next_sum must hold.
+            builder
+                .when_not(local_is_real[i])
+                .assert_septic_ext_eq(current_sum.x.clone(), next_sum.x.clone());
+            builder.when_not(local_is_real[i]).assert_septic_ext_eq(current_sum.y, next_sum.y);
+        }
+
+        // Constrain that the final digest is the next row's initial_digest.
+        let final_digest = ith_cumulative_sum(N - 1);
+
+        let next_initial_digest = SepticCurve::<AB::Expr> {
+            x: SepticExtension::<AB::Expr>::from_base_fn(|i| {
+                next_accumulation.initial_digest[0][i].into()
+            }),
+            y: SepticExtension::<AB::Expr>::from_base_fn(|i| {
+                next_accumulation.initial_digest[1][i].into()
+            }),
+        };
+
+        builder
+            .when_transition()
+            .assert_septic_ext_eq(final_digest.x.clone(), next_initial_digest.x.clone());
+        builder.when_transition().assert_septic_ext_eq(final_digest.y, next_initial_digest.y);
+    }
+}
diff --git a/crates/core/machine/src/operations/global_interaction.rs b/crates/core/machine/src/operations/global_interaction.rs
new file mode 100644
index 0000000000..5e9a7d789a
--- /dev/null
+++ b/crates/core/machine/src/operations/global_interaction.rs
@@ -0,0 +1,337 @@
+use crate::air::WordAirBuilder;
+use p3_air::AirBuilder;
+use p3_field::AbstractExtensionField;
+use p3_field::AbstractField;
+use p3_field::Field;
+use p3_field::PrimeField32;
+use sp1_core_executor::events::ByteRecord;
+use sp1_core_executor::ByteOpcode;
+use sp1_derive::AlignedBorrow;
+use sp1_stark::air::SepticExtensionAirBuilder;
+use sp1_stark::{
+    air::SP1AirBuilder,
+    septic_curve::{SepticCurve, CURVE_WITNESS_DUMMY_POINT_X, CURVE_WITNESS_DUMMY_POINT_Y},
+    septic_extension::{SepticBlock, SepticExtension},
+    InteractionKind,
+};
+
+/// A set of columns witnessing the elliptic curve point that a single global interaction message is mapped to.
+#[derive(AlignedBorrow, Default, Debug, Clone, Copy)]
+#[repr(C)]
+pub struct GlobalInteractionOperation<T> {
+    pub offset_bits: [T; 8], // Little-endian bits of the byte offset returned by `lift_x`.
+    pub x_coordinate: SepticBlock<T>, // x-coordinate of the interaction's curve point.
+    pub y_coordinate: SepticBlock<T>, // y-coordinate of the interaction's curve point.
+    pub y6_bit_decomp: [T; 30], // Little-endian bits of the shifted `y.0[6]` used for the sign check.
+    pub range_check_witness: T, // Inverse of (sum of `y6_bit_decomp[26..30]` - 4); zero on dummy rows.
+}
+
+impl<F: PrimeField32> GlobalInteractionOperation<F> {
+    pub fn get_digest(
+        values: SepticBlock<u32>, // The seven message components.
+        is_receive: bool, // `false` (a send) returns the negated point.
+        kind: InteractionKind, // Mixed into the message as `kind << 24`.
+    ) -> (SepticCurve<F>, u8) { // Returns the point and the offset reported by `lift_x`.
+        let x_start = SepticExtension::<F>::from_base_fn(|i| F::from_canonical_u32(values.0[i]))
+            + SepticExtension::from_base(F::from_canonical_u32((kind as u32) << 24));
+        let (point, offset) = SepticCurve::<F>::lift_x(x_start);
+        if !is_receive {
+            return (point.neg(), offset); // Sends use the negation of the lifted point.
+        }
+        (point, offset)
+    }
+
+    pub fn populate(
+        &mut self,
+        values: SepticBlock<u32>, // The seven message components passed to `get_digest`.
+        is_receive: bool,
+        is_real: bool, // `false` fills the row via `populate_dummy` instead.
+        kind: InteractionKind,
+    ) {
+        if is_real {
+            let (point, offset) = Self::get_digest(values, is_receive, kind);
+            for i in 0..8 {
+                self.offset_bits[i] = F::from_canonical_u8((offset >> i) & 1);
+            }
+            self.x_coordinate = SepticBlock::<F>::from(point.x.0);
+            self.y_coordinate = SepticBlock::<F>::from(point.y.0);
+            let range_check_value = if is_receive {
+                point.y.0[6].as_canonical_u32() - 1 // NOTE(review): plain u32 subtraction; assumes `y.0[6] >= 1` — confirm against `lift_x`.
+            } else {
+                point.y.0[6].as_canonical_u32() - (F::ORDER_U32 + 1) / 2 // NOTE(review): assumes the negated `y.0[6] >= (p + 1) / 2` — confirm.
+            };
+            let mut top_4_bits = F::zero();
+            for i in 0..30 {
+                self.y6_bit_decomp[i] = F::from_canonical_u32((range_check_value >> i) & 1);
+                if i >= 26 {
+                    top_4_bits += self.y6_bit_decomp[i]; // Sum of bits 26..30.
+                }
+            }
+            top_4_bits -= F::from_canonical_u32(4); // Zero exactly when bits 26..30 are all ones.
+            self.range_check_witness = top_4_bits.inverse();
+        } else {
+            self.populate_dummy();
+        }
+    }
+
+    /// Records the byte lookups that mirror the range checks in `eval_single_digest_memory`; no-op unless `is_real`.
+    pub fn populate_memory_range_check_witness(
+        &self,
+        shard: u32, // Shard the lookups are recorded under; also range checked as a u16 itself.
+        value: u32, // Memory word whose four little-endian bytes are range checked.
+        is_real: bool, // When `false`, nothing is recorded.
+        blu: &mut impl ByteRecord, // Sink for the byte lookup events.
+    ) {
+        if is_real {
+            blu.add_u8_range_checks(shard, &value.to_le_bytes());
+            blu.add_u16_range_check(shard, shard as u16); // NOTE(review): `as u16` truncates; assumes `shard` fits in 16 bits.
+        }
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn populate_memory(
+        &mut self,
+        shard: u32,
+        clk: u32,
+        addr: u32,
+        value: u32, // Split into four little-endian bytes below.
+        is_receive: bool,
+        is_real: bool,
+    ) {
+        self.populate(
+            SepticBlock([ // Message layout: [shard, clk, addr, value byte 0, byte 1, byte 2, byte 3].
+                shard,
+                clk,
+                addr,
+                value & 255,
+                (value >> 8) & 255,
+                (value >> 16) & 255,
+                (value >> 24) & 255,
+            ]),
+            is_receive,
+            is_real,
+            InteractionKind::Memory,
+        );
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn populate_syscall_range_check_witness(
+        &self,
+        shard: u32, // Shard the lookups are recorded under; also range checked as a u16 itself.
+        clk_16: u16,
+        clk_8: u8,
+        syscall_id: u32,
+        is_real: bool, // When `false`, nothing is recorded.
+        blu: &mut impl ByteRecord,
+    ) {
+        if is_real {
+            blu.add_u16_range_checks(shard, &[shard as u16, clk_16]); // NOTE(review): `shard as u16` truncates.
+            blu.add_u8_range_checks(shard, &[clk_8, syscall_id as u8]); // NOTE(review): `syscall_id as u8` truncates.
+        }
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn populate_syscall(
+        &mut self,
+        shard: u32,
+        clk_16: u16,
+        clk_8: u8,
+        syscall_id: u32,
+        arg1: u32,
+        arg2: u32,
+        is_receive: bool,
+        is_real: bool,
+    ) {
+        self.populate( // Message layout: [shard, clk_16, clk_8, syscall_id, arg1, arg2, 0]; the last slot is unused.
+            SepticBlock([shard, clk_16.into(), clk_8.into(), syscall_id, arg1, arg2, 0]),
+            is_receive,
+            is_real,
+            InteractionKind::Syscall,
+        );
+    }
+
+    pub fn populate_dummy(&mut self) { // Row for a non-real interaction: zeroed bits and witness, fixed dummy point.
+        for i in 0..8 {
+            self.offset_bits[i] = F::zero();
+        }
+        self.x_coordinate = SepticBlock::<F>::from_base_fn(|i| {
+            F::from_canonical_u32(CURVE_WITNESS_DUMMY_POINT_X[i]) // Presumably a valid curve point, as the curve equation is checked even when not real — verify.
+        });
+        self.y_coordinate = SepticBlock::<F>::from_base_fn(|i| {
+            F::from_canonical_u32(CURVE_WITNESS_DUMMY_POINT_Y[i])
+        });
+        for i in 0..30 {
+            self.y6_bit_decomp[i] = F::zero();
+        }
+        self.range_check_witness = F::zero();
+    }
+}
+
+impl<F: Field> GlobalInteractionOperation<F> {
+    /// Constrain that `cols` holds the curve point the message decompresses to, with `y` of the sign required by `is_receive`. NOTE(review): nothing is sent from this function itself; the columns are presumably consumed by the permutation trace elsewhere.
+    /// The first value in `values` must be a value range checked to u16.
+    fn eval_single_digest<AB: SP1AirBuilder>(
+        builder: &mut AB,
+        values: [AB::Expr; 7],
+        cols: GlobalInteractionOperation<AB::Var>,
+        is_receive: bool,
+        is_real: AB::Var,
+        kind: InteractionKind,
+    ) {
+        // Constrain that `is_real` is boolean.
+        builder.assert_bool(is_real);
+
+        // Recompose the offset from its bits, asserting each bit is boolean so that the offset is a byte.
+        let mut offset = AB::Expr::zero();
+        for i in 0..8 {
+            builder.assert_bool(cols.offset_bits[i]);
+            offset = offset.clone() + cols.offset_bits[i] * AB::F::from_canonical_u32(1 << i);
+        }
+
+        // Compute the message: `values` plus `offset * 2^16 + kind * 2^24` added through `from_base`.
+        let message = SepticExtension(values)
+            + SepticExtension::<AB::Expr>::from_base(
+                offset * AB::F::from_canonical_u32(1 << 16)
+                    + AB::F::from_canonical_u32(kind as u32) * AB::F::from_canonical_u32(1 << 24),
+            );
+
+        // Compute a * m + b.
+        let am_plus_b = SepticCurve::<AB::Expr>::universal_hash(message);
+
+        let x = SepticExtension::<AB::Expr>::from_base_fn(|i| cols.x_coordinate[i].into());
+
+        // Constrain that when `is_real` is true, then `x == a * m + b`.
+        builder.when(is_real).assert_septic_ext_eq(x.clone(), am_plus_b);
+
+        // Read the witnessed y-coordinate.
+        let y = SepticExtension::<AB::Expr>::from_base_fn(|i| cols.y_coordinate[i].into());
+
+        // Constrain that `(x, y)` is a valid point on the curve. This holds on every row, not only when `is_real`.
+        let y2 = y.square();
+        let x3_2x_26z5 = SepticCurve::<AB::Expr>::curve_formula(x);
+
+        builder.assert_septic_ext_eq(y2, x3_2x_26z5);
+        // Recompose the 30-bit `y6_value` and sum its top four bits (26..30).
+        let mut y6_value = AB::Expr::zero();
+        let mut top_4_bits = AB::Expr::zero();
+        for i in 0..30 {
+            builder.assert_bool(cols.y6_bit_decomp[i]);
+            y6_value = y6_value.clone() + cols.y6_bit_decomp[i] * AB::F::from_canonical_u32(1 << i);
+            if i >= 26 {
+                top_4_bits = top_4_bits.clone() + cols.y6_bit_decomp[i];
+            }
+        }
+        top_4_bits = top_4_bits.clone() - AB::Expr::from_canonical_u32(4); // Zero exactly when bits 26..30 are all ones.
+        builder.when(is_real).assert_eq(cols.range_check_witness * top_4_bits, AB::Expr::one());
+        // The inverse witness above only exists if the top four bits are not all ones, i.e. `y6_value < 2^30 - 2^26`.
+        // Constrain that y has correct sign.
+        // If it's a receive: 0 <= y_6 - 1 < (p - 1) / 2 = 2^30 - 2^26
+        // If it's a send: 0 <= y_6 - (p + 1) / 2 < (p - 1) / 2 = 2^30 - 2^26
+        if is_receive {
+            builder.when(is_real).assert_eq(y.0[6].clone(), AB::Expr::one() + y6_value);
+        } else {
+            builder.when(is_real).assert_eq(
+                y.0[6].clone(),
+                AB::Expr::from_canonical_u32((1 << 30) - (1 << 26) + 1) + y6_value,
+            );
+        }
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn eval_single_digest_memory<AB: SP1AirBuilder>(
+        builder: &mut AB,
+        shard: AB::Expr, // Range checked to u16 below.
+        clk: AB::Expr, // Not range checked in this function.
+        addr: AB::Expr, // Not range checked in this function.
+        value: [AB::Expr; 4], // Each limb is range checked to u8 below.
+        cols: GlobalInteractionOperation<AB::Var>,
+        is_receive: bool,
+        is_real: AB::Var,
+    ) {
+        let values = [ // Message layout: [shard, clk, addr, value[0], value[1], value[2], value[3]].
+            shard.clone(),
+            clk.clone(),
+            addr.clone(),
+            value[0].clone(),
+            value[1].clone(),
+            value[2].clone(),
+            value[3].clone(),
+        ];
+        // Constrain `cols` to hold the curve point for this memory message.
+        Self::eval_single_digest(
+            builder,
+            values,
+            cols,
+            is_receive,
+            is_real,
+            InteractionKind::Memory,
+        );
+
+        // Range check for message space; each lookup below is sent with `is_real` as its multiplicity argument.
+        // Range check shard to be a valid u16.
+        builder.send_byte(
+            AB::Expr::from_canonical_u8(ByteOpcode::U16Range as u8),
+            shard,
+            AB::Expr::zero(),
+            AB::Expr::zero(),
+            is_real,
+        );
+        // Range check each of the four limbs of the word value to be a valid u8.
+        builder.slice_range_check_u8(&value, is_real);
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn eval_single_digest_syscall<AB: SP1AirBuilder>(
+        builder: &mut AB,
+        shard: AB::Expr, // Range checked to u16 below.
+        clk_16: AB::Expr, // Range checked to u16 below.
+        clk_8: AB::Expr, // Range checked to u8 below.
+        syscall_id: AB::Expr, // Range checked to u8 below.
+        arg1: AB::Expr, // Not range checked in this function.
+        arg2: AB::Expr, // Not range checked in this function.
+        cols: GlobalInteractionOperation<AB::Var>,
+        is_receive: bool,
+        is_real: AB::Var,
+    ) {
+        let values = [ // Message layout: [shard, clk_16, clk_8, syscall_id, arg1, arg2, 0].
+            shard.clone(),
+            clk_16.clone(),
+            clk_8.clone(),
+            syscall_id.clone(),
+            arg1.clone(),
+            arg2.clone(),
+            AB::Expr::zero(), // Unused seventh message slot.
+        ];
+        // Constrain `cols` to hold the curve point for this syscall message.
+        Self::eval_single_digest(
+            builder,
+            values,
+            cols,
+            is_receive,
+            is_real,
+            InteractionKind::Syscall,
+        );
+
+        // Range check for message space; each lookup below is sent with `is_real` as its multiplicity argument.
+        // Range check shard to be a valid u16.
+        builder.send_byte(
+            AB::Expr::from_canonical_u8(ByteOpcode::U16Range as u8),
+            shard,
+            AB::Expr::zero(),
+            AB::Expr::zero(),
+            is_real,
+        );
+
+        // Range check clk_8 and syscall_id to be u8.
+        builder.slice_range_check_u8(&[clk_8, syscall_id], is_real);
+
+        // Range check clk_16 to be u16.
+        builder.send_byte(
+            AB::Expr::from_canonical_u8(ByteOpcode::U16Range as u8),
+            clk_16,
+            AB::Expr::zero(),
+            AB::Expr::zero(),
+            is_real,
+        );
+    }
+}
diff --git a/crates/core/machine/src/operations/mod.rs b/crates/core/machine/src/operations/mod.rs
index 394daf906b..4dfe2791ee 100644
--- a/crates/core/machine/src/operations/mod.rs
+++ b/crates/core/machine/src/operations/mod.rs
@@ -13,6 +13,8 @@ mod baby_bear_word;
 pub mod field;
 mod fixed_rotate_right;
 mod fixed_shift_right;
+mod global_accumulation;
+mod global_interaction;
 mod is_equal_word;
 mod is_zero;
 mod is_zero_word;
@@ -29,6 +31,8 @@ pub use baby_bear_range::*;
 pub use baby_bear_word::*;
 pub use fixed_rotate_right::*;
 pub use fixed_shift_right::*;
+pub use global_accumulation::*;
+pub use global_interaction::*;
 pub use is_equal_word::*;
 pub use is_zero::*;
 pub use is_zero_word::*;
diff --git a/crates/core/machine/src/riscv/cost.rs b/crates/core/machine/src/riscv/cost.rs
index dc19488d0d..b260954097 100644
--- a/crates/core/machine/src/riscv/cost.rs
+++ b/crates/core/machine/src/riscv/cost.rs
@@ -26,7 +26,7 @@ pub trait CostEstimator {
 impl CostEstimator for ExecutionReport {
     fn estimate_area(&self) -> u64 {
         let mut total_area = 0;
-        let mut total_chips = 3;
+        let mut total_chips = 2;
         let (chips, costs) = RiscvAir::<BabyBear>::get_chips_and_costs();
 
         let cpu_events = self.total_instruction_count();
diff --git a/crates/core/machine/src/riscv/mod.rs b/crates/core/machine/src/riscv/mod.rs
index 5964eba304..b544995d4a 100644
--- a/crates/core/machine/src/riscv/mod.rs
+++ b/crates/core/machine/src/riscv/mod.rs
@@ -9,9 +9,7 @@ use sp1_core_executor::{
 };
 
 use crate::{
-    memory::{
-        MemoryChipType, MemoryLocalChip, MemoryProgramChip, NUM_LOCAL_MEMORY_ENTRIES_PER_ROW,
-    },
+    memory::{MemoryChipType, MemoryLocalChip, NUM_LOCAL_MEMORY_ENTRIES_PER_ROW},
     riscv::MemoryChipType::{Finalize, Initialize},
     syscall::precompiles::fptower::{Fp2AddSubAssignChip, Fp2MulAssignChip, FpOpChip},
 };
@@ -96,7 +94,7 @@ pub enum RiscvAir<F: PrimeField32> {
     /// A table for the local memory state.
     MemoryLocal(MemoryLocalChip),
     /// A table for initializing the program memory.
-    ProgramMemory(MemoryProgramChip),
+    // ProgramMemory(MemoryProgramChip),
     /// A table for all the syscall invocations.
     SyscallCore(SyscallChip),
     /// A table for all the precompile invocations.
@@ -368,9 +366,9 @@ impl<F: PrimeField32> RiscvAir<F> {
         costs.insert(RiscvAirDiscriminants::MemoryLocal, memory_local.cost());
         chips.push(memory_local);
 
-        let memory_program = Chip::new(RiscvAir::ProgramMemory(MemoryProgramChip::default()));
-        costs.insert(RiscvAirDiscriminants::ProgramMemory, memory_program.cost());
-        chips.push(memory_program);
+        // let memory_program = Chip::new(RiscvAir::ProgramMemory(MemoryProgramChip::default()));
+        // costs.insert(RiscvAirDiscriminants::ProgramMemory, memory_program.cost());
+        // chips.push(memory_program);
 
         let byte = Chip::new(RiscvAir::ByteLookup(ByteChip::default()));
         costs.insert(RiscvAirDiscriminants::ByteLookup, byte.cost());
@@ -383,7 +381,7 @@ impl<F: PrimeField32> RiscvAir<F> {
     pub(crate) fn preprocessed_heights(program: &Program) -> Vec<(Self, usize)> {
         vec![
             (RiscvAir::Program(ProgramChip::default()), program.instructions.len()),
-            (RiscvAir::ProgramMemory(MemoryProgramChip::default()), program.memory_image.len()),
+            // (RiscvAir::ProgramMemory(MemoryProgramChip::default()), program.memory_image.len()),
             (RiscvAir::ByteLookup(ByteChip::default()), 1 << 16),
         ]
     }
@@ -461,7 +459,7 @@ impl<F: PrimeField32> RiscvAir<F> {
 
         // Remove the preprocessed chips.
         airs.remove(&Self::Program(ProgramChip::default()));
-        airs.remove(&Self::ProgramMemory(MemoryProgramChip::default()));
+        // airs.remove(&Self::ProgramMemory(MemoryProgramChip::default()));
         airs.remove(&Self::ByteLookup(ByteChip::default()));
 
         airs.into_iter()
@@ -524,7 +522,7 @@ impl<F: PrimeField32> RiscvAir<F> {
             Self::MemoryGlobalInit(_) => unreachable!("Invalid for memory init/final"),
             Self::MemoryGlobalFinal(_) => unreachable!("Invalid for memory init/final"),
             Self::MemoryLocal(_) => unreachable!("Invalid for memory local"),
-            Self::ProgramMemory(_) => unreachable!("Invalid for memory program"),
+            // Self::ProgramMemory(_) => unreachable!("Invalid for memory program"),
             Self::Program(_) => unreachable!("Invalid for core chip"),
             Self::Mul(_) => unreachable!("Invalid for core chip"),
             Self::Lt(_) => unreachable!("Invalid for core chip"),
@@ -579,26 +577,26 @@ pub mod tests {
     use crate::{
         io::SP1Stdin,
         riscv::RiscvAir,
-        utils,
-        utils::{prove, run_test, setup_logger},
+        utils::{self, prove_core, run_test, setup_logger},
     };
 
     use sp1_core_executor::{
         programs::tests::{
             fibonacci_program, simple_memory_program, simple_program, ssz_withdrawals_program,
         },
-        Instruction, Opcode, Program,
+        Instruction, Opcode, Program, SP1Context,
     };
     use sp1_stark::{
-        baby_bear_poseidon2::BabyBearPoseidon2, CpuProver, SP1CoreOpts, StarkProvingKey,
-        StarkVerifyingKey,
+        baby_bear_poseidon2::BabyBearPoseidon2, CpuProver, MachineProver, SP1CoreOpts,
+        StarkProvingKey, StarkVerifyingKey,
     };
 
     #[test]
     fn test_simple_prove() {
         utils::setup_logger();
         let program = simple_program();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
@@ -615,7 +613,8 @@ pub mod tests {
                     Instruction::new(*shift_op, 31, 29, 3, false, false),
                 ];
                 let program = Program::new(instructions, 0, 0);
-                run_test::<CpuProver<_, _>>(program).unwrap();
+                let stdin = SP1Stdin::new();
+                run_test::<CpuProver<_, _>>(program, stdin).unwrap();
             }
         }
     }
@@ -629,7 +628,8 @@ pub mod tests {
             Instruction::new(Opcode::SUB, 31, 30, 29, false, false),
         ];
         let program = Program::new(instructions, 0, 0);
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
@@ -641,7 +641,8 @@ pub mod tests {
             Instruction::new(Opcode::ADD, 31, 30, 29, false, false),
         ];
         let program = Program::new(instructions, 0, 0);
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
@@ -658,7 +659,8 @@ pub mod tests {
                     Instruction::new(*mul_op, 31, 30, 29, false, false),
                 ];
                 let program = Program::new(instructions, 0, 0);
-                run_test::<CpuProver<_, _>>(program).unwrap();
+                let stdin = SP1Stdin::new();
+                run_test::<CpuProver<_, _>>(program, stdin).unwrap();
             }
         }
     }
@@ -674,7 +676,8 @@ pub mod tests {
                 Instruction::new(*lt_op, 31, 30, 29, false, false),
             ];
             let program = Program::new(instructions, 0, 0);
-            run_test::<CpuProver<_, _>>(program).unwrap();
+            let stdin = SP1Stdin::new();
+            run_test::<CpuProver<_, _>>(program, stdin).unwrap();
         }
     }
 
@@ -690,7 +693,8 @@ pub mod tests {
                 Instruction::new(*bitwise_op, 31, 30, 29, false, false),
             ];
             let program = Program::new(instructions, 0, 0);
-            run_test::<CpuProver<_, _>>(program).unwrap();
+            let stdin = SP1Stdin::new();
+            run_test::<CpuProver<_, _>>(program, stdin).unwrap();
         }
     }
 
@@ -713,7 +717,8 @@ pub mod tests {
                     Instruction::new(*div_rem_op, 31, 29, 30, false, false),
                 ];
                 let program = Program::new(instructions, 0, 0);
-                run_test::<CpuProver<_, _>>(program).unwrap();
+                let stdin = SP1Stdin::new();
+                run_test::<CpuProver<_, _>>(program, stdin).unwrap();
             }
         }
     }
@@ -722,7 +727,8 @@ pub mod tests {
     fn test_fibonacci_prove_simple() {
         setup_logger();
         let program = fibonacci_program();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
@@ -734,7 +740,13 @@ pub mod tests {
         let mut opts = SP1CoreOpts::default();
         opts.shard_size = 1024;
         opts.shard_batch_size = 2;
-        prove::<_, CpuProver<_, _>>(program, &stdin, BabyBearPoseidon2::new(), opts, None).unwrap();
+
+        let config = BabyBearPoseidon2::new();
+        let machine = RiscvAir::machine(config);
+        let prover = CpuProver::new(machine);
+        let (pk, vk) = prover.setup(&program);
+        prove_core::<_, _>(&prover, &pk, &vk, program, &stdin, opts, SP1Context::default(), None)
+            .unwrap();
     }
 
     #[test]
@@ -742,28 +754,30 @@ pub mod tests {
         setup_logger();
         let program = fibonacci_program();
         let stdin = SP1Stdin::new();
-        prove::<_, CpuProver<_, _>>(
-            program,
-            &stdin,
-            BabyBearPoseidon2::new(),
-            SP1CoreOpts::default(),
-            None,
-        )
-        .unwrap();
+
+        let opts = SP1CoreOpts::default();
+        let config = BabyBearPoseidon2::new();
+        let machine = RiscvAir::machine(config);
+        let prover = CpuProver::new(machine);
+        let (pk, vk) = prover.setup(&program);
+        prove_core::<_, _>(&prover, &pk, &vk, program, &stdin, opts, SP1Context::default(), None)
+            .unwrap();
     }
 
     #[test]
     fn test_simple_memory_program_prove() {
         setup_logger();
         let program = simple_memory_program();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_ssz_withdrawal() {
         setup_logger();
         let program = ssz_withdrawals_program();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
diff --git a/crates/core/machine/src/riscv/shape.rs b/crates/core/machine/src/riscv/shape.rs
index 4ef0650043..9237cdfbfe 100644
--- a/crates/core/machine/src/riscv/shape.rs
+++ b/crates/core/machine/src/riscv/shape.rs
@@ -9,7 +9,7 @@ use sp1_stark::{air::MachineAir, MachineRecord, ProofShape};
 use thiserror::Error;
 
 use crate::{
-    memory::{MemoryLocalChip, MemoryProgramChip, NUM_LOCAL_MEMORY_ENTRIES_PER_ROW},
+    memory::{MemoryLocalChip, NUM_LOCAL_MEMORY_ENTRIES_PER_ROW},
     riscv::MemoryChipType::{Finalize, Initialize},
 };
 
@@ -37,6 +37,8 @@ pub enum CoreShapeError {
 /// A structure that enables fixing the shape of an executionrecord.
 pub struct CoreShapeConfig<F: PrimeField32> {
     included_shapes: Vec<HashMap<String, usize>>,
+    // Shapes for shards with a CPU chip and memory initialize/finalize events.
+    shapes_with_cpu_and_memory_finalize: Vec<HashMap<RiscvAir<F>, Vec<Option<usize>>>>,
     allowed_preprocessed_log_heights: HashMap<RiscvAir<F>, Vec<Option<usize>>>,
     allowed_core_log_heights: Vec<HashMap<RiscvAir<F>, Vec<Option<usize>>>>,
     maximal_core_log_heights_mask: Vec<bool>,
@@ -119,10 +121,21 @@ impl<F: PrimeField32> CoreShapeConfig<F> {
             // If cpu is included, try to fix the shape as a core.
 
             // Get the heights of the core airs in the record.
-            let heights = RiscvAir::<F>::core_heights(record);
+            let mut heights = RiscvAir::<F>::core_heights(record);
 
-            // Try to find a shape within the included shapes.
-            for (i, allowed_log_heights) in self.allowed_core_log_heights.iter().enumerate() {
+            let mut shape_candidates = self.allowed_core_log_heights.iter().collect::<Vec<_>>();
+
+            // If the record has global memory init/finalize events, replace the candidates with
+            // shapes that include the memory initialize/finalize chip.
+            if !record.global_memory_finalize_events.is_empty()
+                || !record.global_memory_initialize_events.is_empty()
+            {
+                heights.extend(RiscvAir::<F>::get_memory_init_final_heights(record));
+                shape_candidates = self.shapes_with_cpu_and_memory_finalize.iter().collect();
+            }
+
+            // Try to find a shape fitting within at least one of the candidate shapes.
+            for (i, allowed_log_heights) in shape_candidates.iter().enumerate() {
                 if let Some(shape) =
                     Self::find_shape_from_allowed_heights(&heights, allowed_log_heights)
                 {
@@ -151,7 +164,8 @@ impl<F: PrimeField32> CoreShapeConfig<F> {
             return Err(CoreShapeError::ShapeError(record.stats()));
         }
 
-        // If the record is a global memory init/finalize record, try to fix the shape as such.
+        // If the record does not have the CPU chip and is a global memory init/finalize
+        // record, try to fix the shape as such.
         if !record.global_memory_initialize_events.is_empty()
             || !record.global_memory_finalize_events.is_empty()
         {
@@ -204,7 +218,8 @@ impl<F: PrimeField32> CoreShapeConfig<F> {
         mem_events_per_row: usize,
         allowed_log_height: usize,
     ) -> Vec<[(String, usize); 3]> {
-        (1..=air.rows_per_event())
+        // TODO: the `4 *` factor is a temporary shape workaround; replace it with a proper bound.
+        (1..=4 * air.rows_per_event())
             .rev()
             .map(|rows_per_event| {
                 [
@@ -320,17 +335,38 @@ impl<F: PrimeField32> CoreShapeConfig<F> {
 
         max_core_shapes.collect()
     }
+    /// Returns the maximal core shapes followed by the precompile shapes generated from the
+    /// last allowed log height of each precompile, each merged with the last allowed
+    /// preprocessed heights.
+    pub fn maximal_core_plus_precompile_shapes(&self) -> Vec<CoreShape> {
+        // Name and last allowed log height of every preprocessed chip.
+        let preprocessed = self
+            .allowed_preprocessed_log_heights
+            .iter()
+            .map(|(air, heights)| (air.name(), heights.last().unwrap().unwrap()));
+
+        // One shape per entry returned by `get_precompile_shapes`, after the preprocessed part.
+        let precompile_shapes = self
+            .precompile_allowed_log_heights
+            .iter()
+            .flat_map(move |(air, (mem_events_per_row, log_heights))| {
+                self.get_precompile_shapes(air, *mem_events_per_row, *log_heights.last().unwrap())
+            })
+            .map(|shape| preprocessed.clone().chain(shape).collect::<CoreShape>());
+
+        self.maximal_core_shapes().into_iter().chain(precompile_shapes).collect()
+    }
 }
 
 impl<F: PrimeField32> Default for CoreShapeConfig<F> {
     fn default() -> Self {
         // Preprocessed chip heights.
         let program_heights = vec![Some(19), Some(20), Some(21), Some(22)];
-        let program_memory_heights = vec![Some(19), Some(20), Some(21), Some(22)];
+        // let program_memory_heights = vec![Some(19), Some(20), Some(21), Some(22)];
 
         let allowed_preprocessed_log_heights = HashMap::from([
             (RiscvAir::Program(ProgramChip::default()), program_heights),
-            (RiscvAir::ProgramMemory(MemoryProgramChip::default()), program_memory_heights),
+            //    (RiscvAir::ProgramMemory(MemoryProgramChip::default()), program_memory_heights),
             (RiscvAir::ByteLookup(ByteChip::default()), vec![Some(16)]),
         ]);
 
@@ -626,6 +662,20 @@ impl<F: PrimeField32> Default for CoreShapeConfig<F> {
                 divrem_height: vec![Some(10), Some(16), Some(17)],
                 is_potentially_maximal: true,
             },
+            // Shards with mainly arithmetic, few memory accesses, and no division.
+            CoreShapeSpec {
+                cpu_height: vec![Some(21)],
+                add_sub_height: vec![Some(21)],
+                lt_height: vec![Some(19)],
+                bitwise_height: vec![Some(6)],
+                shift_right_height: vec![Some(19)],
+                shift_left_height: vec![Some(6)],
+                syscall_core_height: vec![Some(0)],
+                memory_local_height: vec![Some(6)],
+                mul_height: vec![Some(19)],
+                divrem_height: vec![Some(0)],
+                is_potentially_maximal: true,
+            },
             // Shards with basic arithmetic and branching.
             CoreShapeSpec {
                 cpu_height: vec![Some(21)],
@@ -692,6 +742,146 @@ impl<F: PrimeField32> Default for CoreShapeConfig<F> {
                 .insert(air, (mem_events_per_row, precompile_heights.clone()));
         }
 
+        // Shapes for shards with a CPU chip and memory initialize/finalize events.
+        let shapes_with_cpu_and_memory_finalize = vec![
+            // Small shape with few Muls and LTs.
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(13)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(12)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(4)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(10)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(10)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(6)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![None]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![None]),
+                (RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(15)]),
+            ]),
+            // Small shape with few Muls.
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(14)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(14)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(4)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(10)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(10)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(13)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(6)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![None]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![None]),
+                (RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(15)]),
+            ]),
+            // Small shape with many Muls.
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(15)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(14)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(12)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(12)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(10)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(12)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(7)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![None]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![None]),
+                (RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(15)]),
+            ]),
+            // Medium shape with few muls.
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(17)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(17)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(4)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(10)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(10)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(16)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(6)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![None]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![None]),
+                (RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(15)]),
+            ]),
+            // Medium shape with many Muls.
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(18)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(17)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(15)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(15)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(10)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(15)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(7)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![None]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![None]),
+                (RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(15)]),
+            ]),
+            // Large shapes
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(20)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(20)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(4)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(10)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(10)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(19)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(6)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![None]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![None]),
+                (RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(15)]),
+            ]),
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(20)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(20)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(4)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(10)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(19)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(6)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![Some(2)]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![Some(2)]),
+                (RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(15)]),
+            ]),
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(21)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(21)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(11)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(19)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(19)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(10)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(19)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(7)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![None]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![None]),
+                (RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)), vec![Some(8)]),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(15)]),
+            ]),
+            // Catchall shape.
+            HashMap::from([
+                (RiscvAir::<F>::Cpu(CpuChip::default()), vec![Some(21)]),
+                (RiscvAir::<F>::Add(AddSubChip::default()), vec![Some(21)]),
+                (RiscvAir::<F>::Bitwise(BitwiseChip::default()), vec![Some(19)]),
+                (RiscvAir::<F>::Mul(MulChip::default()), vec![Some(19)]),
+                (RiscvAir::<F>::ShiftRight(ShiftRightChip::default()), vec![Some(19)]),
+                (RiscvAir::<F>::ShiftLeft(ShiftLeft::default()), vec![Some(19)]),
+                (RiscvAir::<F>::Lt(LtChip::default()), vec![Some(20)]),
+                (RiscvAir::<F>::MemoryLocal(MemoryLocalChip::new()), vec![Some(19)]),
+                (RiscvAir::<F>::SyscallCore(SyscallChip::core()), vec![Some(19)]),
+                (RiscvAir::<F>::DivRem(DivRemChip::default()), vec![Some(21)]),
+                (
+                    RiscvAir::<F>::MemoryGlobalInit(MemoryGlobalChip::new(Initialize)),
+                    vec![Some(19)],
+                ),
+                (RiscvAir::<F>::MemoryGlobalFinal(MemoryGlobalChip::new(Finalize)), vec![Some(19)]),
+            ]),
+        ];
+
         Self {
             included_shapes: vec![],
             allowed_preprocessed_log_heights,
@@ -699,6 +889,7 @@ impl<F: PrimeField32> Default for CoreShapeConfig<F> {
             maximal_core_log_heights_mask,
             memory_allowed_log_heights,
             precompile_allowed_log_heights,
+            shapes_with_cpu_and_memory_finalize,
         }
     }
 }
@@ -707,8 +898,7 @@ impl<F: PrimeField32> Default for CoreShapeConfig<F> {
 pub mod tests {
     use std::fmt::Debug;
 
-    use p3_challenger::{CanObserve, FieldChallenger};
-    use sp1_stark::{air::InteractionScope, Dom, MachineProver, StarkGenericConfig};
+    use sp1_stark::{Dom, MachineProver, StarkGenericConfig};
 
     use super::*;
 
@@ -729,30 +919,15 @@ pub mod tests {
         let (pk, _) = prover.setup(&program);
 
         // Try to generate traces.
-        let global_traces = prover.generate_traces(&record, InteractionScope::Global);
-        let local_traces = prover.generate_traces(&record, InteractionScope::Local);
+        let main_traces = prover.generate_traces(&record);
 
         // Try to commit the traces.
-        let global_data = prover.commit(&record, global_traces);
-        let local_data = prover.commit(&record, local_traces);
+        let main_data = prover.commit(&record, main_traces);
 
         let mut challenger = prover.machine().config().challenger();
-        challenger.observe(global_data.main_commit.clone());
-        challenger.observe(local_data.main_commit.clone());
-
-        let global_permutation_challenges: [<SC as StarkGenericConfig>::Challenge; 2] =
-            [challenger.sample_ext_element(), challenger.sample_ext_element()];
 
         // Try to "open".
-        prover
-            .open(
-                &pk,
-                Some(global_data),
-                local_data,
-                &mut challenger,
-                &global_permutation_challenges,
-            )
-            .unwrap();
+        prover.open(&pk, main_data, &mut challenger).unwrap();
     }
 
     #[test]
@@ -769,8 +944,7 @@ pub mod tests {
     fn test_dummy_record() {
         use crate::utils::setup_logger;
         use p3_baby_bear::BabyBear;
-        use sp1_stark::baby_bear_poseidon2::BabyBearPoseidon2;
-        use sp1_stark::CpuProver;
+        use sp1_stark::{baby_bear_poseidon2::BabyBearPoseidon2, CpuProver};
 
         type SC = BabyBearPoseidon2;
         type A = RiscvAir<BabyBear>;
@@ -779,7 +953,7 @@ pub mod tests {
 
         let preprocessed_log_heights = [
             (RiscvAir::<BabyBear>::Program(ProgramChip::default()), 10),
-            (RiscvAir::<BabyBear>::ProgramMemory(MemoryProgramChip::default()), 10),
+            //    (RiscvAir::<BabyBear>::ProgramMemory(MemoryProgramChip::default()), 10),
             (RiscvAir::<BabyBear>::ByteLookup(ByteChip::default()), 16),
         ];
 
diff --git a/crates/core/machine/src/sys.rs b/crates/core/machine/src/sys.rs
new file mode 100644
index 0000000000..aa14f63d71
--- /dev/null
+++ b/crates/core/machine/src/sys.rs
@@ -0,0 +1,184 @@
+use crate::{
+    alu::{AddSubCols, BitwiseCols, LtCols, MulCols, ShiftLeftCols, ShiftRightCols},
+    memory::MemoryInitCols,
+    memory::SingleMemoryLocal,
+    syscall::chip::SyscallCols,
+};
+use hashbrown::HashMap;
+use p3_baby_bear::BabyBear;
+
+use sp1_core_executor::events::{
+    AluEvent, CpuEvent, LookupId, MemoryInitializeFinalizeEvent, MemoryLocalEvent,
+    MemoryReadRecord, MemoryRecordEnum, MemoryWriteRecord, SyscallEvent,
+};
+
+// Routines from the statically linked `sp1-core-machine-sys` library. Each takes one event
+// and a mutable reference to the BabyBear column struct of the matching chip; judging by the
+// names, the callee fills `cols` from `event` (the implementations are not in this file).
+#[link(name = "sp1-core-machine-sys", kind = "static")]
+extern "C-unwind" {
+    pub fn add_sub_event_to_row_babybear(event: &AluEvent, cols: &mut AddSubCols<BabyBear>);
+    pub fn mul_event_to_row_babybear(event: &AluEvent, cols: &mut MulCols<BabyBear>);
+    pub fn bitwise_event_to_row_babybear(event: &AluEvent, cols: &mut BitwiseCols<BabyBear>);
+    pub fn lt_event_to_row_babybear(event: &AluEvent, cols: &mut LtCols<BabyBear>);
+    pub fn sll_event_to_row_babybear(event: &AluEvent, cols: &mut ShiftLeftCols<BabyBear>);
+    pub fn sr_event_to_row_babybear(event: &AluEvent, cols: &mut ShiftRightCols<BabyBear>);
+    pub fn memory_local_event_to_row_babybear(
+        event: &MemoryLocalEvent,
+        cols: &mut SingleMemoryLocal<BabyBear>,
+    );
+    pub fn memory_global_event_to_row_babybear(
+        event: &MemoryInitializeFinalizeEvent,
+        is_receive: bool,
+        cols: &mut MemoryInitCols<BabyBear>,
+    );
+    pub fn syscall_event_to_row_babybear(
+        event: &SyscallEvent,
+        is_receive: bool,
+        cols: &mut SyscallCols<BabyBear>,
+    );
+}
+
+/// An alternative to `Option<MemoryRecordEnum>` that is FFI-safe.
+///
+/// See [`MemoryRecordEnum`].
+#[derive(Debug, Copy, Clone)]
+#[repr(C)]
+pub enum OptionMemoryRecordEnum {
+    /// Read.
+    Read(MemoryReadRecord),
+    /// Write.
+    Write(MemoryWriteRecord),
+    /// No record; the counterpart of `Option::None`.
+    None,
+}
+
+impl From<Option<MemoryRecordEnum>> for OptionMemoryRecordEnum {
+    fn from(value: Option<MemoryRecordEnum>) -> Self {
+        match value {
+            Some(MemoryRecordEnum::Read(r)) => Self::Read(r),
+            Some(MemoryRecordEnum::Write(r)) => Self::Write(r),
+            None => Self::None,
+        }
+    }
+}
+
+impl From<OptionMemoryRecordEnum> for Option<MemoryRecordEnum> {
+    fn from(value: OptionMemoryRecordEnum) -> Self {
+        match value {
+            OptionMemoryRecordEnum::Read(r) => Some(MemoryRecordEnum::Read(r)),
+            OptionMemoryRecordEnum::Write(r) => Some(MemoryRecordEnum::Write(r)),
+            OptionMemoryRecordEnum::None => None,
+        }
+    }
+}
+
+/// An FFI-safe version of [`CpuEvent`] that also looks up nonces ahead of time.
+#[derive(Debug, Clone, Copy)]
+#[repr(C)]
+pub struct CpuEventFfi {
+    /// The clock cycle.
+    pub clk: u32,
+    /// The program counter.
+    pub pc: u32,
+    /// The next program counter.
+    pub next_pc: u32,
+    /// The first operand.
+    pub a: u32,
+    /// The first operand memory record.
+    pub a_record: OptionMemoryRecordEnum,
+    /// The second operand.
+    pub b: u32,
+    /// The second operand memory record.
+    pub b_record: OptionMemoryRecordEnum,
+    /// The third operand.
+    pub c: u32,
+    /// The third operand memory record.
+    pub c_record: OptionMemoryRecordEnum,
+    // Seems to be vestigial. Verify before completely removing this.
+    // /// The memory value.
+    // pub memory: Option<&'a u32>,
+    /// The memory record.
+    pub memory_record: OptionMemoryRecordEnum,
+    /// The exit code.
+    pub exit_code: u32,
+
+    /// The nonce resolved from `alu_lookup_id`.
+    pub alu_nonce: u32,
+    /// The nonce resolved from `syscall_lookup_id`.
+    pub syscall_nonce: u32,
+    /// The nonce resolved from `memory_add_lookup_id`.
+    pub memory_add_nonce: u32,
+    /// The nonce resolved from `memory_sub_lookup_id`.
+    pub memory_sub_nonce: u32,
+    /// The nonce resolved from `branch_gt_lookup_id`.
+    pub branch_gt_nonce: u32,
+    /// The nonce resolved from `branch_lt_lookup_id`.
+    pub branch_lt_nonce: u32,
+    /// The nonce resolved from `branch_add_lookup_id`.
+    pub branch_add_nonce: u32,
+    /// The nonce resolved from `jump_jal_lookup_id`.
+    pub jump_jal_nonce: u32,
+    /// The nonce resolved from `jump_jalr_lookup_id`.
+    pub jump_jalr_nonce: u32,
+    /// The nonce resolved from `auipc_lookup_id`.
+    pub auipc_nonce: u32,
+}
+
+impl CpuEventFfi {
+    /// Builds the FFI-safe counterpart of `event`.
+    ///
+    /// Scalar fields are copied, memory records are converted to
+    /// [`OptionMemoryRecordEnum`], and every `*_lookup_id` is resolved through
+    /// `nonce_lookup`; an id without an entry yields a nonce of `0`.
+    pub fn new(event: &CpuEvent, nonce_lookup: &HashMap<LookupId, u32>) -> Self {
+        // Resolve a lookup id to its nonce, defaulting to 0 when the id is absent.
+        let nonce = |id: &LookupId| nonce_lookup.get(id).copied().unwrap_or_default();
+        let &CpuEvent {
+            clk,
+            pc,
+            next_pc,
+            a,
+            a_record,
+            b,
+            b_record,
+            c,
+            c_record,
+            memory_record,
+            exit_code,
+            ref alu_lookup_id,
+            ref syscall_lookup_id,
+            ref memory_add_lookup_id,
+            ref memory_sub_lookup_id,
+            ref branch_gt_lookup_id,
+            ref branch_lt_lookup_id,
+            ref branch_add_lookup_id,
+            ref jump_jal_lookup_id,
+            ref jump_jalr_lookup_id,
+            ref auipc_lookup_id,
+        } = event;
+        Self {
+            clk,
+            pc,
+            next_pc,
+            a,
+            a_record: a_record.into(),
+            b,
+            b_record: b_record.into(),
+            c,
+            c_record: c_record.into(),
+            memory_record: memory_record.into(),
+            exit_code,
+            alu_nonce: nonce(alu_lookup_id),
+            syscall_nonce: nonce(syscall_lookup_id),
+            memory_add_nonce: nonce(memory_add_lookup_id),
+            memory_sub_nonce: nonce(memory_sub_lookup_id),
+            branch_gt_nonce: nonce(branch_gt_lookup_id),
+            branch_lt_nonce: nonce(branch_lt_lookup_id),
+            branch_add_nonce: nonce(branch_add_lookup_id),
+            jump_jal_nonce: nonce(jump_jal_lookup_id),
+            jump_jalr_nonce: nonce(jump_jalr_lookup_id),
+            auipc_nonce: nonce(auipc_lookup_id),
+        }
+    }
+}
diff --git a/crates/core/machine/src/syscall/chip.rs b/crates/core/machine/src/syscall/chip.rs
index 00257d46aa..6ced3d79d8 100644
--- a/crates/core/machine/src/syscall/chip.rs
+++ b/crates/core/machine/src/syscall/chip.rs
@@ -1,18 +1,28 @@
-use core::fmt;
-use std::{
-    borrow::{Borrow, BorrowMut},
-    mem::size_of,
+use crate::{
+    operations::GlobalAccumulationOperation, operations::GlobalInteractionOperation,
+    utils::pad_rows_fixed,
 };
-
+use core::fmt;
+use hashbrown::HashMap;
+use itertools::Itertools;
 use p3_air::{Air, BaseAir};
+use p3_field::AbstractField;
 use p3_field::PrimeField32;
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
+use p3_maybe_rayon::prelude::IntoParallelRefIterator;
+use p3_maybe_rayon::prelude::ParallelBridge;
+use p3_maybe_rayon::prelude::ParallelIterator;
+use p3_maybe_rayon::prelude::ParallelSlice;
+use sp1_core_executor::events::ByteLookupEvent;
+use sp1_core_executor::events::ByteRecord;
 use sp1_core_executor::{events::SyscallEvent, ExecutionRecord, Program};
 use sp1_derive::AlignedBorrow;
 use sp1_stark::air::{InteractionScope, MachineAir, SP1AirBuilder};
-
-use crate::utils::pad_rows_fixed;
-
+use sp1_stark::septic_digest::SepticDigest;
+use std::{
+    borrow::{Borrow, BorrowMut},
+    mem::size_of,
+};
 /// The number of main trace columns for `SyscallChip`.
 pub const NUM_SYSCALL_COLS: usize = size_of::<SyscallCols<u8>>();
 
@@ -39,8 +49,14 @@ impl SyscallChip {
     pub const fn precompile() -> Self {
         Self::new(SyscallShardKind::Precompile)
     }
+
+    pub fn shard_kind(&self) -> SyscallShardKind {
+        self.shard_kind
+    }
 }
 
+pub const SYSCALL_INITIAL_DIGEST_POS_COPY: usize = 60;
+
 /// The column layout for the chip.
 #[derive(AlignedBorrow, Default, Clone, Copy)]
 #[repr(C)]
@@ -48,10 +64,11 @@ pub struct SyscallCols<T> {
     /// The shard number of the syscall.
     pub shard: T,
 
-    /// The clk of the syscall.
-    pub clk: T,
+    /// The bottom 16 bits of clk of the syscall.
+    pub clk_16: T,
 
-    pub nonce: T,
+    /// The top 8 bits of clk of the syscall.
+    pub clk_8: T,
 
     /// The syscall_id of the syscall.
     pub syscall_id: T,
@@ -63,6 +80,12 @@ pub struct SyscallCols<T> {
     pub arg2: T,
 
     pub is_real: T,
+
+    /// The global interaction columns.
+    pub global_interaction_cols: GlobalInteractionOperation<T>,
+
+    /// The columns for accumulating the elliptic curve digests.
+    pub global_accumulation_cols: GlobalAccumulationOperation<T, 1>,
 }
 
 impl<F: PrimeField32> MachineAir<F> for SyscallChip {
@@ -74,8 +97,38 @@ impl<F: PrimeField32> MachineAir<F> for SyscallChip {
         format!("Syscall{}", self.shard_kind).to_string()
     }
 
-    fn generate_dependencies(&self, _input: &ExecutionRecord, _output: &mut ExecutionRecord) {
-        // Do nothing since this chip has no dependencies.
+    fn generate_dependencies(&self, input: &ExecutionRecord, output: &mut ExecutionRecord) {
+        let events = match self.shard_kind {
+            SyscallShardKind::Core => &input.syscall_events,
+            SyscallShardKind::Precompile => &input
+                .precompile_events
+                .all_events()
+                .map(|(event, _)| event.to_owned())
+                .collect::<Vec<_>>(),
+        };
+        let chunk_size = std::cmp::max(events.len() / num_cpus::get(), 1);
+        let blu_batches = events
+            .par_chunks(chunk_size)
+            .map(|events| {
+                let mut blu: HashMap<u32, HashMap<ByteLookupEvent, usize>> = HashMap::new();
+                events.iter().for_each(|event| {
+                    let mut row = [F::zero(); NUM_SYSCALL_COLS];
+                    let cols: &mut SyscallCols<F> = row.as_mut_slice().borrow_mut();
+                    let clk_16 = (event.clk & 65535) as u16;
+                    let clk_8 = (event.clk >> 16) as u8;
+                    cols.global_interaction_cols.populate_syscall_range_check_witness(
+                        event.shard,
+                        clk_16,
+                        clk_8,
+                        event.syscall_id,
+                        true,
+                        &mut blu,
+                    );
+                });
+                blu
+            })
+            .collect::<Vec<_>>();
+        output.add_sharded_byte_lookup_events(blu_batches.iter().collect_vec());
     }
 
     fn generate_trace(
@@ -83,37 +136,62 @@ impl<F: PrimeField32> MachineAir<F> for SyscallChip {
         input: &ExecutionRecord,
         _output: &mut ExecutionRecord,
     ) -> RowMajorMatrix<F> {
-        let mut rows = Vec::new();
+        let mut global_cumulative_sum = SepticDigest::<F>::zero().0;
 
-        let row_fn = |syscall_event: &SyscallEvent| {
+        let row_fn = |syscall_event: &SyscallEvent, is_receive: bool| {
             let mut row = [F::zero(); NUM_SYSCALL_COLS];
             let cols: &mut SyscallCols<F> = row.as_mut_slice().borrow_mut();
 
+            debug_assert!(syscall_event.clk < (1 << 24));
+            let clk_16 = (syscall_event.clk & 65535) as u16;
+            let clk_8 = (syscall_event.clk >> 16) as u8;
+
             cols.shard = F::from_canonical_u32(syscall_event.shard);
-            cols.clk = F::from_canonical_u32(syscall_event.clk);
+            cols.clk_16 = F::from_canonical_u16(clk_16);
+            cols.clk_8 = F::from_canonical_u8(clk_8);
             cols.syscall_id = F::from_canonical_u32(syscall_event.syscall_id);
-            cols.nonce = F::from_canonical_u32(syscall_event.nonce);
             cols.arg1 = F::from_canonical_u32(syscall_event.arg1);
             cols.arg2 = F::from_canonical_u32(syscall_event.arg2);
             cols.is_real = F::one();
+            cols.global_interaction_cols.populate_syscall(
+                syscall_event.shard,
+                clk_16,
+                clk_8,
+                syscall_event.syscall_id,
+                syscall_event.arg1,
+                syscall_event.arg2,
+                is_receive,
+                true,
+            );
             row
         };
 
-        match self.shard_kind {
-            SyscallShardKind::Core => {
-                for event in input.syscall_events.iter() {
-                    let row = row_fn(event);
-                    rows.push(row);
-                }
-            }
-            SyscallShardKind::Precompile => {
-                for event in input.precompile_events.all_events().map(|(event, _)| event) {
-                    let row = row_fn(event);
-                    rows.push(row);
-                }
-            }
+        let mut rows = match self.shard_kind {
+            SyscallShardKind::Core => input
+                .syscall_events
+                .par_iter()
+                .map(|event| row_fn(event, false))
+                .collect::<Vec<_>>(),
+            SyscallShardKind::Precompile => input
+                .precompile_events
+                .all_events()
+                .map(|(event, _)| event)
+                .par_bridge()
+                .map(|event| row_fn(event, true))
+                .collect::<Vec<_>>(),
         };
 
+        let num_events = rows.len();
+
+        for i in 0..num_events {
+            let cols: &mut SyscallCols<F> = rows[i].as_mut_slice().borrow_mut();
+            cols.global_accumulation_cols.populate(
+                &mut global_cumulative_sum,
+                [cols.global_interaction_cols],
+                [cols.is_real],
+            );
+        }
+
         // Pad the trace to a power of two depending on the proof shape in `input`.
         pad_rows_fixed(
             &mut rows,
@@ -121,7 +199,21 @@ impl<F: PrimeField32> MachineAir<F> for SyscallChip {
             input.fixed_log2_rows::<F, _>(self),
         );
 
-        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_SYSCALL_COLS)
+        let mut trace =
+            RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_SYSCALL_COLS);
+
+        for i in num_events..trace.height() {
+            let cols: &mut SyscallCols<F> =
+                trace.values[i * NUM_SYSCALL_COLS..(i + 1) * NUM_SYSCALL_COLS].borrow_mut();
+            cols.global_interaction_cols.populate_dummy();
+            cols.global_accumulation_cols.populate(
+                &mut global_cumulative_sum,
+                [cols.global_interaction_cols],
+                [cols.is_real],
+            );
+        }
+
+        trace
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -153,6 +245,8 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &SyscallCols<AB::Var> = (*local).borrow();
+        let next = main.row_slice(1);
+        let next: &SyscallCols<AB::Var> = (*next).borrow();
 
         builder.assert_eq(
             local.is_real * local.is_real * local.is_real,
@@ -163,8 +257,7 @@ where
             SyscallShardKind::Core => {
                 builder.receive_syscall(
                     local.shard,
-                    local.clk,
-                    local.nonce,
+                    local.clk_16 + local.clk_8 * AB::Expr::from_canonical_u32(1 << 16),
                     local.syscall_id,
                     local.arg1,
                     local.arg2,
@@ -173,22 +266,23 @@ where
                 );
 
                 // Send the call to the global bus to/from the precompile chips.
-                builder.send_syscall(
-                    local.shard,
-                    local.clk,
-                    local.nonce,
-                    local.syscall_id,
-                    local.arg1,
-                    local.arg2,
+                GlobalInteractionOperation::<AB::F>::eval_single_digest_syscall(
+                    builder,
+                    local.shard.into(),
+                    local.clk_16.into(),
+                    local.clk_8.into(),
+                    local.syscall_id.into(),
+                    local.arg1.into(),
+                    local.arg2.into(),
+                    local.global_interaction_cols,
+                    false,
                     local.is_real,
-                    InteractionScope::Global,
                 );
             }
             SyscallShardKind::Precompile => {
                 builder.send_syscall(
                     local.shard,
-                    local.clk,
-                    local.nonce,
+                    local.clk_16 + local.clk_8 * AB::Expr::from_canonical_u32(1 << 16),
                     local.syscall_id,
                     local.arg1,
                     local.arg2,
@@ -196,19 +290,29 @@ where
                     InteractionScope::Local,
                 );
 
-                // Send the call to the global bus to/from the precompile chips.
-                builder.receive_syscall(
-                    local.shard,
-                    local.clk,
-                    local.nonce,
-                    local.syscall_id,
-                    local.arg1,
-                    local.arg2,
+                GlobalInteractionOperation::<AB::F>::eval_single_digest_syscall(
+                    builder,
+                    local.shard.into(),
+                    local.clk_16.into(),
+                    local.clk_8.into(),
+                    local.syscall_id.into(),
+                    local.arg1.into(),
+                    local.arg2.into(),
+                    local.global_interaction_cols,
+                    true,
                     local.is_real,
-                    InteractionScope::Global,
                 );
             }
         }
+
+        GlobalAccumulationOperation::<AB::F, 1>::eval_accumulation(
+            builder,
+            [local.global_interaction_cols],
+            [local.is_real],
+            [next.is_real],
+            local.global_accumulation_cols,
+            next.global_accumulation_cols,
+        );
     }
 }
 
diff --git a/crates/core/machine/src/syscall/precompiles/edwards/ed_add.rs b/crates/core/machine/src/syscall/precompiles/edwards/ed_add.rs
index 7488a772c3..d29e3c0488 100644
--- a/crates/core/machine/src/syscall/precompiles/edwards/ed_add.rs
+++ b/crates/core/machine/src/syscall/precompiles/edwards/ed_add.rs
@@ -9,7 +9,7 @@ use itertools::Itertools;
 use num::{BigUint, Zero};
 
 use crate::air::MemoryAirBuilder;
-use p3_air::{Air, AirBuilder, BaseAir};
+use p3_air::{Air, BaseAir};
 use p3_field::{AbstractField, PrimeField32};
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use p3_maybe_rayon::prelude::{IntoParallelRefIterator, ParallelIterator, ParallelSlice};
@@ -45,7 +45,6 @@ pub struct EdAddAssignCols<T> {
     pub is_real: T,
     pub shard: T,
     pub clk: T,
-    pub nonce: T,
     pub p_ptr: T,
     pub q_ptr: T,
     pub p_access: [MemoryWriteCols<T>; WORDS_CURVE_POINT],
@@ -158,17 +157,7 @@ impl<F: PrimeField32, E: EllipticCurve + EdwardsParameters> MachineAir<F> for Ed
         );
 
         // Convert the trace to a row major matrix.
-        let mut trace =
-            RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_ED_ADD_COLS);
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut EdAddAssignCols<F> =
-                trace.values[i * NUM_ED_ADD_COLS..(i + 1) * NUM_ED_ADD_COLS].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_ED_ADD_COLS)
     }
 
     fn generate_dependencies(&self, input: &Self::Record, output: &mut Self::Record) {
@@ -204,6 +193,10 @@ impl<F: PrimeField32, E: EllipticCurve + EdwardsParameters> MachineAir<F> for Ed
             !shard.get_precompile_events(SyscallCode::ED_ADD).is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<E: EllipticCurve + EdwardsParameters> EdAddAssignChip<E> {
@@ -255,12 +248,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &EdAddAssignCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &EdAddAssignCols<AB::Var> = (*next).borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         let x1: Limbs<AB::Var, <Ed25519BaseField as NumLimbs>::Limbs> =
             limbs_from_prev_access(&local.p_access[0..8]);
@@ -328,7 +315,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             AB::F::from_canonical_u32(SyscallCode::ED_ADD.syscall_id()),
             local.p_ptr,
             local.q_ptr,
@@ -344,19 +330,21 @@ mod tests {
     use sp1_stark::CpuProver;
     use test_artifacts::{ED25519_ELF, ED_ADD_ELF};
 
-    use crate::utils;
+    use crate::{io::SP1Stdin, utils};
 
     #[test]
     fn test_ed_add_simple() {
         utils::setup_logger();
         let program = Program::from(ED_ADD_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_ed25519_program() {
         utils::setup_logger();
         let program = Program::from(ED25519_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 }
diff --git a/crates/core/machine/src/syscall/precompiles/edwards/ed_decompress.rs b/crates/core/machine/src/syscall/precompiles/edwards/ed_decompress.rs
index 51b3f32e59..34a3c9283f 100644
--- a/crates/core/machine/src/syscall/precompiles/edwards/ed_decompress.rs
+++ b/crates/core/machine/src/syscall/precompiles/edwards/ed_decompress.rs
@@ -45,7 +45,6 @@ pub struct EdDecompressCols<T> {
     pub is_real: T,
     pub shard: T,
     pub clk: T,
-    pub nonce: T,
     pub ptr: T,
     pub sign: T,
     pub x_access: GenericArray<MemoryWriteCols<T>, WordsFieldElement>,
@@ -71,9 +70,6 @@ impl<F: PrimeField32> EdDecompressCols<F> {
         self.shard = F::from_canonical_u32(event.shard);
         self.clk = F::from_canonical_u32(event.clk);
         self.ptr = F::from_canonical_u32(event.ptr);
-        self.nonce = F::from_canonical_u32(
-            record.nonce_lookup.get(event.lookup_id.0 as usize).copied().unwrap_or_default(),
-        );
         self.sign = F::from_bool(event.sign);
         for i in 0..8 {
             self.x_access[i].populate(event.x_memory_records[i], &mut new_byte_lookup_events);
@@ -99,7 +95,9 @@ impl<F: PrimeField32> EdDecompressCols<F> {
         let dyy = self.dyy.populate(blu_events, shard, &E::d_biguint(), &yy, FieldOperation::Mul);
         let v = self.v.populate(blu_events, shard, &one, &dyy, FieldOperation::Add);
         let u_div_v = self.u_div_v.populate(blu_events, shard, &u, &v, FieldOperation::Div);
-        let x = self.x.populate(blu_events, shard, &u_div_v, ed25519_sqrt);
+        let x = self.x.populate(blu_events, shard, &u_div_v, |p| {
+            ed25519_sqrt(p).expect("ed25519_sqrt failed, syscall invariant violated")
+        });
         self.neg_x.populate(blu_events, shard, &BigUint::zero(), &x, FieldOperation::Sub);
     }
 }
@@ -181,7 +179,6 @@ impl<V: Copy> EdDecompressCols<V> {
         builder.receive_syscall(
             self.shard,
             self.clk,
-            self.nonce,
             AB::F::from_canonical_u32(SyscallCode::ED_DECOMPRESS.syscall_id()),
             self.ptr,
             self.sign,
@@ -244,20 +241,7 @@ impl<F: PrimeField32, E: EdwardsParameters> MachineAir<F> for EdDecompressChip<E
             input.fixed_log2_rows::<F, _>(self),
         );
 
-        let mut trace = RowMajorMatrix::new(
-            rows.into_iter().flatten().collect::<Vec<_>>(),
-            NUM_ED_DECOMPRESS_COLS,
-        );
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut EdDecompressCols<F> = trace.values
-                [i * NUM_ED_DECOMPRESS_COLS..(i + 1) * NUM_ED_DECOMPRESS_COLS]
-                .borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_ED_DECOMPRESS_COLS)
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -267,6 +251,10 @@ impl<F: PrimeField32, E: EdwardsParameters> MachineAir<F> for EdDecompressChip<E
             !shard.get_precompile_events(SyscallCode::ED_DECOMPRESS).is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<F, E: EdwardsParameters> BaseAir<F> for EdDecompressChip<E> {
@@ -283,12 +271,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &EdDecompressCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &EdDecompressCols<AB::Var> = (*next).borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         local.eval::<AB, E::BaseField, E>(builder);
     }
@@ -300,12 +282,13 @@ pub mod tests {
     use sp1_stark::CpuProver;
     use test_artifacts::ED_DECOMPRESS_ELF;
 
-    use crate::utils;
+    use crate::{io::SP1Stdin, utils};
 
     #[test]
     fn test_ed_decompress() {
         utils::setup_logger();
         let program = Program::from(ED_DECOMPRESS_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 }
diff --git a/crates/core/machine/src/syscall/precompiles/fptower/fp.rs b/crates/core/machine/src/syscall/precompiles/fptower/fp.rs
index a9c21016c7..d74b682adc 100644
--- a/crates/core/machine/src/syscall/precompiles/fptower/fp.rs
+++ b/crates/core/machine/src/syscall/precompiles/fptower/fp.rs
@@ -8,7 +8,7 @@ use crate::{air::MemoryAirBuilder, utils::zeroed_f_vec};
 use generic_array::GenericArray;
 use itertools::Itertools;
 use num::{BigUint, Zero};
-use p3_air::{Air, AirBuilder, BaseAir};
+use p3_air::{Air, BaseAir};
 use p3_field::{AbstractField, PrimeField32};
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use sp1_core_executor::{
@@ -43,7 +43,6 @@ pub struct FpOpChip<P> {
 pub struct FpOpCols<T, P: FpOpField> {
     pub is_real: T,
     pub shard: T,
-    pub nonce: T,
     pub clk: T,
     pub is_add: T,
     pub is_sub: T,
@@ -88,8 +87,8 @@ impl<F: PrimeField32, P: FpOpField> MachineAir<F> for FpOpChip<P> {
     }
 
     fn generate_trace(&self, input: &Self::Record, output: &mut Self::Record) -> RowMajorMatrix<F> {
-        // All the fp events for a given curve are coalesce to the curve's Add operation.  Only retrieve
-        // precompile events for that operation.
+        // All the fp events for a given curve are coalesced into the curve's Add operation.  Only
+        // retrieve precompile events for that operation.
         // TODO:  Fix this.
 
         let events = match P::FIELD_TYPE {
@@ -165,17 +164,7 @@ impl<F: PrimeField32, P: FpOpField> MachineAir<F> for FpOpChip<P> {
         );
 
         // Convert the trace to a row major matrix.
-        let mut trace =
-            RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), num_fp_cols::<P>());
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut FpOpCols<F, P> =
-                trace.values[i * num_fp_cols::<P>()..(i + 1) * num_fp_cols::<P>()].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), num_fp_cols::<P>())
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -202,6 +191,10 @@ impl<F: PrimeField32, P: FpOpField> MachineAir<F> for FpOpChip<P> {
             }
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<F, P: FpOpField> BaseAir<F> for FpOpChip<P> {
@@ -219,12 +212,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &FpOpCols<AB::Var, P> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &FpOpCols<AB::Var, P> = (*next).borrow();
-
-        // Check that nonce is incremented.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         // Check that operations flags are boolean.
         builder.assert_bool(local.is_add);
@@ -295,7 +282,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             syscall_id_felt,
             local.x_ptr,
             local.y_ptr,
diff --git a/crates/core/machine/src/syscall/precompiles/fptower/fp2_addsub.rs b/crates/core/machine/src/syscall/precompiles/fptower/fp2_addsub.rs
index 7f28309597..ee47aef714 100644
--- a/crates/core/machine/src/syscall/precompiles/fptower/fp2_addsub.rs
+++ b/crates/core/machine/src/syscall/precompiles/fptower/fp2_addsub.rs
@@ -8,7 +8,7 @@ use crate::{air::MemoryAirBuilder, utils::zeroed_f_vec};
 use generic_array::GenericArray;
 use itertools::Itertools;
 use num::{BigUint, Zero};
-use p3_air::{Air, AirBuilder, BaseAir};
+use p3_air::{Air, BaseAir};
 use p3_field::{AbstractField, PrimeField32};
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use sp1_core_executor::{
@@ -40,7 +40,6 @@ pub const fn num_fp2_addsub_cols<P: FpOpField>() -> usize {
 pub struct Fp2AddSubAssignCols<T, P: FpOpField> {
     pub is_real: T,
     pub shard: T,
-    pub nonce: T,
     pub clk: T,
     pub is_add: T,
     pub x_ptr: T,
@@ -91,8 +90,8 @@ impl<F: PrimeField32, P: FpOpField> MachineAir<F> for Fp2AddSubAssignChip<P> {
     }
 
     fn generate_trace(&self, input: &Self::Record, output: &mut Self::Record) -> RowMajorMatrix<F> {
-        // All the fp2 sub and add events for a given curve are coalesce to the curve's Add operation.  Only retrieve
-        // precompile events for that operation.
+        // All the fp2 sub and add events for a given curve are coalesced into the curve's Add
+        // operation.  Only retrieve precompile events for that operation.
         // TODO:  Fix this.
 
         let events = match P::FIELD_TYPE {
@@ -175,25 +174,15 @@ impl<F: PrimeField32, P: FpOpField> MachineAir<F> for Fp2AddSubAssignChip<P> {
         );
 
         // Convert the trace to a row major matrix.
-        let mut trace = RowMajorMatrix::new(
+        RowMajorMatrix::new(
             rows.into_iter().flatten().collect::<Vec<_>>(),
             num_fp2_addsub_cols::<P>(),
-        );
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut Fp2AddSubAssignCols<F, P> = trace.values
-                [i * num_fp2_addsub_cols::<P>()..(i + 1) * num_fp2_addsub_cols::<P>()]
-                .borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        )
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
-        // All the fp2 sub and add events for a given curve are coalesce to the curve's Add operation.  Only retrieve
-        // precompile events for that operation.
+        // All the fp2 sub and add events for a given curve are coalesced into the curve's Add
+        // operation.  Only retrieve precompile events for that operation.
         // TODO:  Fix this.
 
         assert!(
@@ -214,6 +203,10 @@ impl<F: PrimeField32, P: FpOpField> MachineAir<F> for Fp2AddSubAssignChip<P> {
             }
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<F, P: FpOpField> BaseAir<F> for Fp2AddSubAssignChip<P> {
@@ -231,14 +224,10 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &Fp2AddSubAssignCols<AB::Var, P> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &Fp2AddSubAssignCols<AB::Var, P> = (*next).borrow();
 
         // Constrain the `is_add` flag to be boolean.
         builder.assert_bool(local.is_add);
 
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
         let num_words_field_element = <P as NumLimbs>::Limbs::USIZE / 4;
 
         let p_x = limbs_from_prev_access(&local.x_access[0..num_words_field_element]);
@@ -318,7 +307,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             syscall_id_felt,
             local.x_ptr,
             local.y_ptr,
diff --git a/crates/core/machine/src/syscall/precompiles/fptower/fp2_mul.rs b/crates/core/machine/src/syscall/precompiles/fptower/fp2_mul.rs
index 95a624cc2f..9c26bab1db 100644
--- a/crates/core/machine/src/syscall/precompiles/fptower/fp2_mul.rs
+++ b/crates/core/machine/src/syscall/precompiles/fptower/fp2_mul.rs
@@ -7,7 +7,7 @@ use crate::{air::MemoryAirBuilder, utils::zeroed_f_vec};
 use generic_array::GenericArray;
 use itertools::Itertools;
 use num::{BigUint, Zero};
-use p3_air::{Air, AirBuilder, BaseAir};
+use p3_air::{Air, BaseAir};
 use p3_field::{AbstractField, PrimeField32};
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use sp1_core_executor::{
@@ -40,7 +40,6 @@ pub const fn num_fp2_mul_cols<P: FieldParameters + NumWords>() -> usize {
 pub struct Fp2MulAssignCols<T, P: FieldParameters + NumWords> {
     pub is_real: T,
     pub shard: T,
-    pub nonce: T,
     pub clk: T,
     pub x_ptr: T,
     pub y_ptr: T,
@@ -214,20 +213,7 @@ impl<F: PrimeField32, P: FpOpField> MachineAir<F> for Fp2MulAssignChip<P> {
         );
 
         // Convert the trace to a row major matrix.
-        let mut trace = RowMajorMatrix::new(
-            rows.into_iter().flatten().collect::<Vec<_>>(),
-            num_fp2_mul_cols::<P>(),
-        );
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut Fp2MulAssignCols<F, P> = trace.values
-                [i * num_fp2_mul_cols::<P>()..(i + 1) * num_fp2_mul_cols::<P>()]
-                .borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), num_fp2_mul_cols::<P>())
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -244,6 +230,10 @@ impl<F: PrimeField32, P: FpOpField> MachineAir<F> for Fp2MulAssignChip<P> {
             }
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<F, P: FpOpField> BaseAir<F> for Fp2MulAssignChip<P> {
@@ -261,11 +251,7 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &Fp2MulAssignCols<AB::Var, P> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &Fp2MulAssignCols<AB::Var, P> = (*next).borrow();
 
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
         let num_words_field_element = <P as NumLimbs>::Limbs::USIZE / 4;
 
         let p_x = limbs_from_prev_access(&local.x_access[0..num_words_field_element]);
@@ -371,7 +357,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             syscall_id_felt,
             local.x_ptr,
             local.y_ptr,
diff --git a/crates/core/machine/src/syscall/precompiles/fptower/mod.rs b/crates/core/machine/src/syscall/precompiles/fptower/mod.rs
index bf097adc05..752b5ae422 100644
--- a/crates/core/machine/src/syscall/precompiles/fptower/mod.rs
+++ b/crates/core/machine/src/syscall/precompiles/fptower/mod.rs
@@ -16,47 +16,53 @@ mod tests {
         BN254_FP2_MUL_ELF, BN254_FP_ELF,
     };
 
-    use crate::utils;
+    use crate::{io::SP1Stdin, utils};
 
     #[test]
     fn test_bls12381_fp_ops() {
         utils::setup_logger();
         let program = Program::from(BLS12381_FP_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bls12381_fp2_addsub() {
         utils::setup_logger();
         let program = Program::from(BLS12381_FP2_ADDSUB_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bls12381_fp2_mul() {
         utils::setup_logger();
         let program = Program::from(BLS12381_FP2_MUL_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bn254_fp_ops() {
         utils::setup_logger();
         let program = Program::from(BN254_FP_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bn254_fp2_addsub() {
         utils::setup_logger();
         let program = Program::from(BN254_FP2_ADDSUB_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bn254_fp2_mul() {
         utils::setup_logger();
         let program = Program::from(BN254_FP2_MUL_ELF).unwrap();
-        utils::run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 }
diff --git a/crates/core/machine/src/syscall/precompiles/keccak256/air.rs b/crates/core/machine/src/syscall/precompiles/keccak256/air.rs
index d6c46cefce..4e299ec7ae 100644
--- a/crates/core/machine/src/syscall/precompiles/keccak256/air.rs
+++ b/crates/core/machine/src/syscall/precompiles/keccak256/air.rs
@@ -33,10 +33,6 @@ where
         let local: &KeccakMemCols<AB::Var> = (*local).borrow();
         let next: &KeccakMemCols<AB::Var> = (*next).borrow();
 
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
-
         let first_step = local.keccak.step_flags[0];
         let final_step = local.keccak.step_flags[NUM_ROUNDS - 1];
         let not_final_step = AB::Expr::one() - final_step;
@@ -66,7 +62,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             AB::F::from_canonical_u32(SyscallCode::KECCAK_PERMUTE.syscall_id()),
             local.state_addr,
             AB::Expr::zero(),
@@ -140,14 +135,15 @@ mod test {
     use crate::{
         io::SP1Stdin,
         riscv::RiscvAir,
-        utils::{prove, setup_logger},
+        utils::{prove_core, setup_logger},
     };
     use sp1_primitives::io::SP1PublicValues;
 
     use rand::{Rng, SeedableRng};
-    use sp1_core_executor::Program;
+    use sp1_core_executor::{Program, SP1Context};
     use sp1_stark::{
-        baby_bear_poseidon2::BabyBearPoseidon2, CpuProver, SP1CoreOpts, StarkGenericConfig,
+        baby_bear_poseidon2::BabyBearPoseidon2, CpuProver, MachineProver, SP1CoreOpts,
+        StarkGenericConfig,
     };
     use test_artifacts::KECCAK256_ELF;
     use tiny_keccak::Hasher;
@@ -181,9 +177,21 @@ mod test {
         let config = BabyBearPoseidon2::new();
 
         let program = Program::from(KECCAK256_ELF).unwrap();
-        let (proof, public_values, _) =
-            prove::<_, CpuProver<_, _>>(program, &stdin, config, SP1CoreOpts::default(), None)
-                .unwrap();
+        let opts = SP1CoreOpts::default();
+        let machine = RiscvAir::machine(config);
+        let prover = CpuProver::new(machine);
+        let (pk, vk) = prover.setup(&program);
+        let (proof, public_values, _) = prove_core::<_, _>(
+            &prover,
+            &pk,
+            &vk,
+            program,
+            &stdin,
+            opts,
+            SP1Context::default(),
+            None,
+        )
+        .unwrap();
         let mut public_values = SP1PublicValues::from(&public_values);
 
         let config = BabyBearPoseidon2::new();
diff --git a/crates/core/machine/src/syscall/precompiles/keccak256/columns.rs b/crates/core/machine/src/syscall/precompiles/keccak256/columns.rs
index 7b622b3bc1..68e4035d18 100644
--- a/crates/core/machine/src/syscall/precompiles/keccak256/columns.rs
+++ b/crates/core/machine/src/syscall/precompiles/keccak256/columns.rs
@@ -19,7 +19,6 @@ pub(crate) struct KeccakMemCols<T> {
 
     pub shard: T,
     pub clk: T,
-    pub nonce: T,
     pub state_addr: T,
 
     /// Memory columns for the state.
diff --git a/crates/core/machine/src/syscall/precompiles/keccak256/mod.rs b/crates/core/machine/src/syscall/precompiles/keccak256/mod.rs
index 348702a570..ba9b76b022 100644
--- a/crates/core/machine/src/syscall/precompiles/keccak256/mod.rs
+++ b/crates/core/machine/src/syscall/precompiles/keccak256/mod.rs
@@ -4,7 +4,7 @@ mod trace;
 
 use p3_keccak_air::KeccakAir;
 
-pub(crate) const STATE_SIZE: usize = 25;
+pub const STATE_SIZE: usize = 25;
 
 // The permutation state is 25 u64's.  Our word size is 32 bits, so it is 50 words.
 pub const STATE_NUM_WORDS: usize = STATE_SIZE * 2;
@@ -25,7 +25,10 @@ pub mod permute_tests {
     use sp1_stark::{CpuProver, SP1CoreOpts};
     use test_artifacts::KECCAK_PERMUTE_ELF;
 
-    use crate::utils::{self, run_test};
+    use crate::{
+        io::SP1Stdin,
+        utils::{self},
+    };
 
     pub fn keccak_permute_program() -> Program {
         let digest_ptr = 100;
@@ -58,13 +61,15 @@ pub mod permute_tests {
         utils::setup_logger();
 
         let program = keccak_permute_program();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_keccak_permute_program_prove() {
         utils::setup_logger();
         let program = Program::from(KECCAK_PERMUTE_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        utils::run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 }
diff --git a/crates/core/machine/src/syscall/precompiles/keccak256/trace.rs b/crates/core/machine/src/syscall/precompiles/keccak256/trace.rs
index 020b28c9f0..900ff55878 100644
--- a/crates/core/machine/src/syscall/precompiles/keccak256/trace.rs
+++ b/crates/core/machine/src/syscall/precompiles/keccak256/trace.rs
@@ -96,16 +96,7 @@ impl<F: PrimeField32> MachineAir<F> for KeccakPermuteChip {
             });
 
         // Convert the trace to a row major matrix.
-        let mut trace = RowMajorMatrix::new(values, NUM_KECCAK_MEM_COLS);
-
-        // Write the nonce to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut KeccakMemCols<F> =
-                trace.values[i * NUM_KECCAK_MEM_COLS..(i + 1) * NUM_KECCAK_MEM_COLS].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(values, NUM_KECCAK_MEM_COLS)
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
diff --git a/crates/core/machine/src/syscall/precompiles/sha256/compress/air.rs b/crates/core/machine/src/syscall/precompiles/sha256/compress/air.rs
index 2ecb8deb37..066a5db297 100644
--- a/crates/core/machine/src/syscall/precompiles/sha256/compress/air.rs
+++ b/crates/core/machine/src/syscall/precompiles/sha256/compress/air.rs
@@ -39,10 +39,6 @@ where
         let local: &ShaCompressCols<AB::Var> = (*local).borrow();
         let next: &ShaCompressCols<AB::Var> = (*next).borrow();
 
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
-
         self.eval_control_flow_flags(builder, local, next);
 
         self.eval_memory(builder, local);
@@ -55,7 +51,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             AB::F::from_canonical_u32(SyscallCode::SHA_COMPRESS.syscall_id()),
             local.w_ptr,
             local.h_ptr,
diff --git a/crates/core/machine/src/syscall/precompiles/sha256/compress/columns.rs b/crates/core/machine/src/syscall/precompiles/sha256/compress/columns.rs
index 5d48b9edcc..c5510142df 100644
--- a/crates/core/machine/src/syscall/precompiles/sha256/compress/columns.rs
+++ b/crates/core/machine/src/syscall/precompiles/sha256/compress/columns.rs
@@ -25,7 +25,6 @@ pub const NUM_SHA_COMPRESS_COLS: usize = size_of::<ShaCompressCols<u8>>();
 pub struct ShaCompressCols<T> {
     /// Inputs.
     pub shard: T,
-    pub nonce: T,
     pub clk: T,
     pub w_ptr: T,
     pub h_ptr: T,
diff --git a/crates/core/machine/src/syscall/precompiles/sha256/compress/mod.rs b/crates/core/machine/src/syscall/precompiles/sha256/compress/mod.rs
index bd15cce2dd..08e58cca31 100644
--- a/crates/core/machine/src/syscall/precompiles/sha256/compress/mod.rs
+++ b/crates/core/machine/src/syscall/precompiles/sha256/compress/mod.rs
@@ -36,7 +36,10 @@ pub mod compress_tests {
     use sp1_stark::CpuProver;
     use test_artifacts::SHA_COMPRESS_ELF;
 
-    use crate::utils::{run_test, setup_logger};
+    use crate::{
+        io::SP1Stdin,
+        utils::{run_test, setup_logger},
+    };
 
     pub fn sha_compress_program() -> Program {
         let w_ptr = 100;
@@ -67,13 +70,15 @@ pub mod compress_tests {
     fn prove_babybear() {
         setup_logger();
         let program = sha_compress_program();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_sha_compress_program() {
         setup_logger();
         let program = Program::from(SHA_COMPRESS_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 }
diff --git a/crates/core/machine/src/syscall/precompiles/sha256/compress/trace.rs b/crates/core/machine/src/syscall/precompiles/sha256/compress/trace.rs
index d6b61b67f2..5333de82cb 100644
--- a/crates/core/machine/src/syscall/precompiles/sha256/compress/trace.rs
+++ b/crates/core/machine/src/syscall/precompiles/sha256/compress/trace.rs
@@ -3,7 +3,7 @@ use std::borrow::BorrowMut;
 use hashbrown::HashMap;
 use itertools::Itertools;
 use p3_field::PrimeField32;
-use p3_matrix::{dense::RowMajorMatrix, Matrix};
+use p3_matrix::dense::RowMajorMatrix;
 use p3_maybe_rayon::prelude::{ParallelIterator, ParallelSlice};
 use sp1_core_executor::{
     events::{ByteLookupEvent, ByteRecord, PrecompileEvent, ShaCompressEvent},
@@ -77,20 +77,7 @@ impl<F: PrimeField32> MachineAir<F> for ShaCompressChip {
         }
 
         // Convert the trace to a row major matrix.
-        let mut trace = RowMajorMatrix::new(
-            rows.into_iter().flatten().collect::<Vec<_>>(),
-            NUM_SHA_COMPRESS_COLS,
-        );
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut ShaCompressCols<F> = trace.values
-                [i * NUM_SHA_COMPRESS_COLS..(i + 1) * NUM_SHA_COMPRESS_COLS]
-                .borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_SHA_COMPRESS_COLS)
     }
 
     fn generate_dependencies(&self, input: &Self::Record, output: &mut Self::Record) {
diff --git a/crates/core/machine/src/syscall/precompiles/sha256/extend/air.rs b/crates/core/machine/src/syscall/precompiles/sha256/extend/air.rs
index f5da0f247a..17e8648918 100644
--- a/crates/core/machine/src/syscall/precompiles/sha256/extend/air.rs
+++ b/crates/core/machine/src/syscall/precompiles/sha256/extend/air.rs
@@ -33,10 +33,6 @@ where
         let local: &ShaExtendCols<AB::Var> = (*local).borrow();
         let next: &ShaExtendCols<AB::Var> = (*next).borrow();
 
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
-
         let i_start = AB::F::from_canonical_u32(16);
         let nb_bytes_in_word = AB::F::from_canonical_u32(4);
 
@@ -203,7 +199,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             AB::F::from_canonical_u32(SyscallCode::SHA_EXTEND.syscall_id()),
             local.w_ptr,
             AB::Expr::zero(),
diff --git a/crates/core/machine/src/syscall/precompiles/sha256/extend/columns.rs b/crates/core/machine/src/syscall/precompiles/sha256/extend/columns.rs
index ff7a5f5f7c..69b5fcd2a9 100644
--- a/crates/core/machine/src/syscall/precompiles/sha256/extend/columns.rs
+++ b/crates/core/machine/src/syscall/precompiles/sha256/extend/columns.rs
@@ -17,7 +17,6 @@ pub const NUM_SHA_EXTEND_COLS: usize = size_of::<ShaExtendCols<u8>>();
 pub struct ShaExtendCols<T> {
     /// Inputs.
     pub shard: T,
-    pub nonce: T,
     pub clk: T,
     pub w_ptr: T,
 
diff --git a/crates/core/machine/src/syscall/precompiles/sha256/extend/mod.rs b/crates/core/machine/src/syscall/precompiles/sha256/extend/mod.rs
index e978902812..53eae24694 100644
--- a/crates/core/machine/src/syscall/precompiles/sha256/extend/mod.rs
+++ b/crates/core/machine/src/syscall/precompiles/sha256/extend/mod.rs
@@ -39,7 +39,10 @@ pub mod extend_tests {
     use sp1_stark::{air::MachineAir, CpuProver};
     use test_artifacts::{SHA2_ELF, SHA_EXTEND_ELF};
 
-    use crate::utils::{self, run_test};
+    use crate::{
+        io::SP1Stdin,
+        utils::{self, run_test},
+    };
 
     use super::ShaExtendChip;
 
@@ -75,20 +78,23 @@ pub mod extend_tests {
     fn test_sha_prove() {
         utils::setup_logger();
         let program = sha_extend_program();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_sha256_program() {
         utils::setup_logger();
         let program = Program::from(SHA2_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_sha_extend_program() {
         utils::setup_logger();
         let program = Program::from(SHA_EXTEND_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 }
diff --git a/crates/core/machine/src/syscall/precompiles/sha256/extend/trace.rs b/crates/core/machine/src/syscall/precompiles/sha256/extend/trace.rs
index 75e1a16533..ca61b642c3 100644
--- a/crates/core/machine/src/syscall/precompiles/sha256/extend/trace.rs
+++ b/crates/core/machine/src/syscall/precompiles/sha256/extend/trace.rs
@@ -1,7 +1,7 @@
 use hashbrown::HashMap;
 use itertools::Itertools;
 use p3_field::PrimeField32;
-use p3_matrix::{dense::RowMajorMatrix, Matrix};
+use p3_matrix::dense::RowMajorMatrix;
 use p3_maybe_rayon::prelude::{ParallelIterator, ParallelSlice};
 use sp1_core_executor::{
     events::{ByteLookupEvent, ByteRecord, PrecompileEvent, ShaExtendEvent},
@@ -51,19 +51,7 @@ impl<F: PrimeField32> MachineAir<F> for ShaExtendChip {
         }
 
         // Convert the trace to a row major matrix.
-        let mut trace = RowMajorMatrix::new(
-            rows.into_iter().flatten().collect::<Vec<_>>(),
-            NUM_SHA_EXTEND_COLS,
-        );
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut ShaExtendCols<F> =
-                trace.values[i * NUM_SHA_EXTEND_COLS..(i + 1) * NUM_SHA_EXTEND_COLS].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_SHA_EXTEND_COLS)
     }
 
     fn generate_dependencies(&self, input: &Self::Record, output: &mut Self::Record) {
diff --git a/crates/core/machine/src/syscall/precompiles/u256x2048_mul/air.rs b/crates/core/machine/src/syscall/precompiles/u256x2048_mul/air.rs
index 43a9906581..054bf4f4fe 100644
--- a/crates/core/machine/src/syscall/precompiles/u256x2048_mul/air.rs
+++ b/crates/core/machine/src/syscall/precompiles/u256x2048_mul/air.rs
@@ -278,7 +278,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             AB::F::from_canonical_u32(SyscallCode::U256XU2048_MUL.syscall_id()),
             local.a_ptr,
             local.b_ptr,
diff --git a/crates/core/machine/src/syscall/precompiles/u256x2048_mul/mod.rs b/crates/core/machine/src/syscall/precompiles/u256x2048_mul/mod.rs
index df525659d8..75a64dd9b3 100644
--- a/crates/core/machine/src/syscall/precompiles/u256x2048_mul/mod.rs
+++ b/crates/core/machine/src/syscall/precompiles/u256x2048_mul/mod.rs
@@ -24,7 +24,10 @@ mod tests {
 
     use crate::{
         io::SP1Stdin,
-        utils::{self, run_test_io, uni_stark_prove as prove, uni_stark_verify as verify},
+        utils::{
+            self, run_test,
+            uni_stark::{uni_stark_prove, uni_stark_verify},
+        },
     };
     use crate::{
         syscall::precompiles::u256x2048_mul::air::U256x2048MulChip, utils::words_to_bytes_le_vec,
@@ -169,7 +172,7 @@ mod tests {
     fn test_uint256_mul() {
         utils::setup_logger();
         let program = Program::from(U256XU2048_MUL_ELF).unwrap();
-        run_test_io::<CpuProver<_, _>>(program, SP1Stdin::new()).unwrap();
+        run_test::<CpuProver<_, _>>(program, SP1Stdin::new()).unwrap();
     }
 
     #[test]
@@ -179,8 +182,13 @@ mod tests {
         let chip = U256x2048MulChip::new();
         let trace: RowMajorMatrix<BabyBear> =
             chip.generate_trace(&execution_record, &mut ExecutionRecord::default());
-        let proof = prove::<BabyBearPoseidon2, _>(&config, &chip, &mut config.challenger(), trace);
-        verify(&config, &chip, &mut config.challenger(), &proof).unwrap();
+        let proof = uni_stark_prove::<BabyBearPoseidon2, _>(
+            &config,
+            &chip,
+            &mut config.challenger(),
+            trace,
+        );
+        uni_stark_verify(&config, &chip, &mut config.challenger(), &proof).unwrap();
     }
 
     #[test]
@@ -191,9 +199,13 @@ mod tests {
             let chip = U256x2048MulChip::new();
             let trace: RowMajorMatrix<BabyBear> =
                 chip.generate_trace(&execution_record, &mut ExecutionRecord::default());
-            let proof =
-                prove::<BabyBearPoseidon2, _>(&config, &chip, &mut config.challenger(), trace);
-            let result = verify(&config, &chip, &mut config.challenger(), &proof);
+            let proof = uni_stark_prove::<BabyBearPoseidon2, _>(
+                &config,
+                &chip,
+                &mut config.challenger(),
+                trace,
+            );
+            let result = uni_stark_verify(&config, &chip, &mut config.challenger(), &proof);
             assert!(result.is_err());
         }
     }
diff --git a/crates/core/machine/src/syscall/precompiles/uint256/air.rs b/crates/core/machine/src/syscall/precompiles/uint256/air.rs
index 54e0925f9e..3e10f9c4f3 100644
--- a/crates/core/machine/src/syscall/precompiles/uint256/air.rs
+++ b/crates/core/machine/src/syscall/precompiles/uint256/air.rs
@@ -14,7 +14,7 @@ use crate::{
 
 use generic_array::GenericArray;
 use num::{BigUint, One, Zero};
-use p3_air::{Air, AirBuilder, BaseAir};
+use p3_air::{Air, BaseAir};
 use p3_field::{AbstractField, PrimeField32};
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use sp1_core_executor::{
@@ -62,9 +62,6 @@ pub struct Uint256MulCols<T> {
     /// The clock cycle of the syscall.
     pub clk: T,
 
-    /// The nonce of the operation.
-    pub nonce: T,
-
     /// The pointer to the first input.
     pub x_ptr: T,
 
@@ -207,17 +204,7 @@ impl<F: PrimeField32> MachineAir<F> for Uint256MulChip {
         );
 
         // Convert the trace to a row major matrix.
-        let mut trace =
-            RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_COLS);
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut Uint256MulCols<F> =
-                trace.values[i * NUM_COLS..(i + 1) * NUM_COLS].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), NUM_COLS)
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -227,6 +214,10 @@ impl<F: PrimeField32> MachineAir<F> for Uint256MulChip {
             !shard.get_precompile_events(SyscallCode::UINT256_MUL).is_empty()
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<F> BaseAir<F> for Uint256MulChip {
@@ -244,12 +235,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &Uint256MulCols<AB::Var> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &Uint256MulCols<AB::Var> = (*next).borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         // We are computing (x * y) % modulus. The value of x is stored in the "prev_value" of
         // the x_memory, since we write to it later.
@@ -331,7 +316,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             AB::F::from_canonical_u32(SyscallCode::UINT256_MUL.syscall_id()),
             local.x_ptr,
             local.y_ptr,
diff --git a/crates/core/machine/src/syscall/precompiles/uint256/mod.rs b/crates/core/machine/src/syscall/precompiles/uint256/mod.rs
index 7bea9fcca3..fb2cb787be 100644
--- a/crates/core/machine/src/syscall/precompiles/uint256/mod.rs
+++ b/crates/core/machine/src/syscall/precompiles/uint256/mod.rs
@@ -12,14 +12,14 @@ mod tests {
 
     use crate::{
         io::SP1Stdin,
-        utils::{self, run_test_io},
+        utils::{self, run_test},
     };
 
     #[test]
     fn test_uint256_mul() {
         utils::setup_logger();
         let program = Program::from(UINT256_MUL_ELF).unwrap();
-        run_test_io::<CpuProver<_, _>>(program, SP1Stdin::new()).unwrap();
+        run_test::<CpuProver<_, _>>(program, SP1Stdin::new()).unwrap();
     }
 
     #[test]
diff --git a/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_add.rs b/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_add.rs
index ac82fb88ce..ea03a08587 100644
--- a/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_add.rs
+++ b/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_add.rs
@@ -47,7 +47,6 @@ pub const fn num_weierstrass_add_cols<P: FieldParameters + NumWords>() -> usize
 pub struct WeierstrassAddAssignCols<T, P: FieldParameters + NumWords> {
     pub is_real: T,
     pub shard: T,
-    pub nonce: T,
     pub clk: T,
     pub p_ptr: T,
     pub q_ptr: T,
@@ -252,18 +251,7 @@ impl<F: PrimeField32, E: EllipticCurve + WeierstrassParameters> MachineAir<F>
         });
 
         // Convert the trace to a row major matrix.
-        let mut trace = RowMajorMatrix::new(values, num_weierstrass_add_cols::<E::BaseField>());
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut WeierstrassAddAssignCols<F, E::BaseField> = trace.values[i
-                * num_weierstrass_add_cols::<E::BaseField>()
-                ..(i + 1) * num_weierstrass_add_cols::<E::BaseField>()]
-                .borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(values, num_weierstrass_add_cols::<E::BaseField>())
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -285,6 +273,10 @@ impl<F: PrimeField32, E: EllipticCurve + WeierstrassParameters> MachineAir<F>
             }
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<F, E: EllipticCurve> BaseAir<F> for WeierstrassAddAssignChip<E> {
@@ -302,12 +294,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &WeierstrassAddAssignCols<AB::Var, E::BaseField> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &WeierstrassAddAssignCols<AB::Var, E::BaseField> = (*next).borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         let num_words_field_element = <E::BaseField as NumLimbs>::Limbs::USIZE / 4;
 
@@ -418,7 +404,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             syscall_id_felt,
             local.p_ptr,
             local.q_ptr,
@@ -471,61 +456,72 @@ mod tests {
         SECP256K1_ADD_ELF, SECP256K1_MUL_ELF, SECP256R1_ADD_ELF,
     };
 
-    use crate::utils::{run_test, setup_logger};
+    use crate::{
+        io::SP1Stdin,
+        utils::{run_test, setup_logger},
+    };
 
     #[test]
     fn test_secp256k1_add_simple() {
         setup_logger();
         let program = Program::from(SECP256K1_ADD_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_secp256r1_add_simple() {
         setup_logger();
         let program = Program::from(SECP256R1_ADD_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bn254_add_simple() {
         setup_logger();
         let program = Program::from(BN254_ADD_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bn254_mul_simple() {
         setup_logger();
         let program = Program::from(BN254_MUL_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_secp256k1_mul_simple() {
         setup_logger();
         let program = Program::from(SECP256K1_MUL_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bls12381_add_simple() {
         setup_logger();
         let program = Program::from(BLS12381_ADD_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bls12381_double_simple() {
         setup_logger();
         let program = Program::from(BLS12381_DOUBLE_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bls12381_mul_simple() {
         setup_logger();
         let program = Program::from(BLS12381_MUL_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 }
diff --git a/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_decompress.rs b/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_decompress.rs
index da010cf81d..e428259011 100644
--- a/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_decompress.rs
+++ b/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_decompress.rs
@@ -49,7 +49,6 @@ pub struct WeierstrassDecompressCols<T, P: FieldParameters + NumWords> {
     pub is_real: T,
     pub shard: T,
     pub clk: T,
-    pub nonce: T,
     pub ptr: T,
     pub sign_bit: T,
     pub x_access: GenericArray<MemoryReadCols<T>, P::WordsFieldElement>,
@@ -278,16 +277,7 @@ impl<F: PrimeField32, E: EllipticCurve + WeierstrassParameters> MachineAir<F>
             input.fixed_log2_rows::<F, _>(self),
         );
 
-        let mut trace = RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), width);
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut WeierstrassDecompressCols<F, E::BaseField> =
-                trace.values[i * width..i * width + weierstrass_width].borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(rows.into_iter().flatten().collect::<Vec<_>>(), width)
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -308,6 +298,10 @@ impl<F: PrimeField32, E: EllipticCurve + WeierstrassParameters> MachineAir<F>
             }
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<F, E: EllipticCurve> BaseAir<F> for WeierstrassDecompressChip<E> {
@@ -334,13 +328,6 @@ where
         let local_slice = main.row_slice(0);
         let local: &WeierstrassDecompressCols<AB::Var, E::BaseField> =
             (*local_slice)[0..weierstrass_cols].borrow();
-        let next = main.row_slice(1);
-        let next: &WeierstrassDecompressCols<AB::Var, E::BaseField> =
-            (*next)[0..weierstrass_cols].borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         let num_limbs = <E::BaseField as NumLimbs>::Limbs::USIZE;
         let num_words_field_element = num_limbs / 4;
@@ -528,7 +515,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             syscall_id,
             local.ptr,
             local.sign_bit,
@@ -540,7 +526,10 @@ where
 
 #[cfg(test)]
 mod tests {
-    use crate::{io::SP1Stdin, utils};
+    use crate::{
+        io::SP1Stdin,
+        utils::{self, run_test},
+    };
     use amcl::{
         bls381::bls381::{basic::key_pair_generate_g2, utils::deserialize_g1},
         rand::RAND,
@@ -553,8 +542,6 @@ mod tests {
         BLS12381_DECOMPRESS_ELF, SECP256K1_DECOMPRESS_ELF, SECP256R1_DECOMPRESS_ELF,
     };
 
-    use crate::utils::run_test_io;
-
     #[test]
     fn test_weierstrass_bls_decompress() {
         utils::setup_logger();
@@ -570,11 +557,9 @@ mod tests {
             let (_, compressed) = key_pair_generate_g2(&mut rand);
 
             let stdin = SP1Stdin::from(&compressed);
-            let mut public_values = run_test_io::<CpuProver<_, _>>(
-                Program::from(BLS12381_DECOMPRESS_ELF).unwrap(),
-                stdin,
-            )
-            .unwrap();
+            let mut public_values =
+                run_test::<CpuProver<_, _>>(Program::from(BLS12381_DECOMPRESS_ELF).unwrap(), stdin)
+                    .unwrap();
 
             let mut result = [0; 96];
             public_values.read_slice(&mut result);
@@ -604,7 +589,7 @@ mod tests {
 
             let inputs = SP1Stdin::from(&compressed);
 
-            let mut public_values = run_test_io::<CpuProver<_, _>>(
+            let mut public_values = run_test::<CpuProver<_, _>>(
                 Program::from(SECP256K1_DECOMPRESS_ELF).unwrap(),
                 inputs,
             )
@@ -633,7 +618,7 @@ mod tests {
 
             let inputs = SP1Stdin::from(compressed);
 
-            let mut public_values = run_test_io::<CpuProver<_, _>>(
+            let mut public_values = run_test::<CpuProver<_, _>>(
                 Program::from(SECP256R1_DECOMPRESS_ELF).unwrap(),
                 inputs,
             )
diff --git a/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_double.rs b/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_double.rs
index e7a8f9600f..34f3fbfc55 100644
--- a/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_double.rs
+++ b/crates/core/machine/src/syscall/precompiles/weierstrass/weierstrass_double.rs
@@ -46,7 +46,6 @@ pub const fn num_weierstrass_double_cols<P: FieldParameters + NumWords>() -> usi
 pub struct WeierstrassDoubleAssignCols<T, P: FieldParameters + NumWords> {
     pub is_real: T,
     pub shard: T,
-    pub nonce: T,
     pub clk: T,
     pub p_ptr: T,
     pub p_access: GenericArray<MemoryWriteCols<T>, P::WordsCurvePoint>,
@@ -274,18 +273,7 @@ impl<F: PrimeField32, E: EllipticCurve + WeierstrassParameters> MachineAir<F>
         });
 
         // Convert the trace to a row major matrix.
-        let mut trace = RowMajorMatrix::new(values, num_weierstrass_double_cols::<E::BaseField>());
-
-        // Write the nonces to the trace.
-        for i in 0..trace.height() {
-            let cols: &mut WeierstrassDoubleAssignCols<F, E::BaseField> = trace.values[i
-                * num_weierstrass_double_cols::<E::BaseField>()
-                ..(i + 1) * num_weierstrass_double_cols::<E::BaseField>()]
-                .borrow_mut();
-            cols.nonce = F::from_canonical_usize(i);
-        }
-
-        trace
+        RowMajorMatrix::new(values, num_weierstrass_double_cols::<E::BaseField>())
     }
 
     fn included(&self, shard: &Self::Record) -> bool {
@@ -309,6 +297,10 @@ impl<F: PrimeField32, E: EllipticCurve + WeierstrassParameters> MachineAir<F>
             }
         }
     }
+
+    fn local_only(&self) -> bool {
+        true
+    }
 }
 
 impl<E: EllipticCurve + WeierstrassParameters> WeierstrassDoubleAssignChip<E> {
@@ -352,12 +344,6 @@ where
         let main = builder.main();
         let local = main.row_slice(0);
         let local: &WeierstrassDoubleAssignCols<AB::Var, E::BaseField> = (*local).borrow();
-        let next = main.row_slice(1);
-        let next: &WeierstrassDoubleAssignCols<AB::Var, E::BaseField> = (*next).borrow();
-
-        // Constrain the incrementing nonce.
-        builder.when_first_row().assert_zero(local.nonce);
-        builder.when_transition().assert_eq(local.nonce + AB::Expr::one(), next.nonce);
 
         let num_words_field_element = E::BaseField::NB_LIMBS / 4;
         let p_x = limbs_from_prev_access(&local.p_access[0..num_words_field_element]);
@@ -480,7 +466,6 @@ where
         builder.receive_syscall(
             local.shard,
             local.clk,
-            local.nonce,
             syscall_id_felt,
             local.p_ptr,
             AB::Expr::zero(),
@@ -498,33 +483,40 @@ pub mod tests {
         BLS12381_DOUBLE_ELF, BN254_DOUBLE_ELF, SECP256K1_DOUBLE_ELF, SECP256R1_DOUBLE_ELF,
     };
 
-    use crate::utils::{run_test, setup_logger};
+    use crate::{
+        io::SP1Stdin,
+        utils::{run_test, setup_logger},
+    };
 
     #[test]
     fn test_secp256k1_double_simple() {
         setup_logger();
         let program = Program::from(SECP256K1_DOUBLE_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_secp256r1_double_simple() {
         setup_logger();
         let program = Program::from(SECP256R1_DOUBLE_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bn254_double_simple() {
         setup_logger();
         let program = Program::from(BN254_DOUBLE_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 
     #[test]
     fn test_bls12381_double_simple() {
         setup_logger();
         let program = Program::from(BLS12381_DOUBLE_ELF).unwrap();
-        run_test::<CpuProver<_, _>>(program).unwrap();
+        let stdin = SP1Stdin::new();
+        run_test::<CpuProver<_, _>>(program, stdin).unwrap();
     }
 }
diff --git a/crates/core/machine/src/utils/mod.rs b/crates/core/machine/src/utils/mod.rs
index aed57bd751..a16298fe2c 100644
--- a/crates/core/machine/src/utils/mod.rs
+++ b/crates/core/machine/src/utils/mod.rs
@@ -2,16 +2,19 @@ pub mod concurrency;
 mod logger;
 mod prove;
 mod span;
-mod tracer;
+mod test;
+pub mod uni_stark;
 
 pub use logger::*;
 use p3_field::Field;
 pub use prove::*;
 use sp1_curves::params::Limbs;
 pub use span::*;
-pub use tracer::*;
+pub use test::*;
+pub use uni_stark::*;
 
 use crate::memory::MemoryCols;
+
 use generic_array::ArrayLength;
 use p3_maybe_rayon::prelude::{ParallelBridge, ParallelIterator};
 
diff --git a/crates/core/machine/src/utils/prove.rs b/crates/core/machine/src/utils/prove.rs
index bca7ee88a8..fefbc4b839 100644
--- a/crates/core/machine/src/utils/prove.rs
+++ b/crates/core/machine/src/utils/prove.rs
@@ -1,133 +1,83 @@
+use p3_matrix::dense::RowMajorMatrix;
 use std::{
-    collections::VecDeque,
     fs::File,
-    io::{
-        Seek, {self},
+    io::{self, Seek, SeekFrom},
+    sync::{
+        mpsc::{channel, sync_channel, Sender},
+        Arc, Mutex,
     },
-    sync::{mpsc::sync_channel, Arc, Mutex},
+    thread::ScopedJoinHandle,
 };
 use web_time::Instant;
 
 use crate::riscv::{CoreShapeConfig, RiscvAir};
-use p3_challenger::FieldChallenger;
 use p3_maybe_rayon::prelude::*;
-use serde::{de::DeserializeOwned, Serialize};
-use size::Size;
-use sp1_stark::{
-    air::InteractionScope, baby_bear_poseidon2::BabyBearPoseidon2, MachineProvingKey,
-    MachineVerificationError,
-};
-use std::thread::ScopedJoinHandle;
+use sp1_stark::MachineProvingKey;
+use sp1_stark::StarkVerifyingKey;
 use thiserror::Error;
 
-use p3_baby_bear::BabyBear;
 use p3_field::PrimeField32;
-use p3_matrix::Matrix;
+use sp1_stark::air::MachineAir;
 
 use crate::{
     io::SP1Stdin,
-    riscv::cost::CostEstimator,
     utils::{chunk_vec, concurrency::TurnBasedSync},
 };
 use sp1_core_executor::{
     events::{format_table_line, sorted_table_lines},
     ExecutionState,
 };
-use sp1_primitives::io::SP1PublicValues;
 
 use sp1_core_executor::{
     subproof::NoOpSubproofVerifier, ExecutionError, ExecutionRecord, ExecutionReport, Executor,
     Program, SP1Context,
 };
 use sp1_stark::{
-    air::{MachineAir, PublicValues},
-    Com, CpuProver, DebugConstraintBuilder, InteractionBuilder, MachineProof, MachineProver,
-    MachineRecord, OpeningProof, PcsProverData, ProverConstraintFolder, SP1CoreOpts,
-    StarkGenericConfig, StarkMachine, StarkProvingKey, StarkVerifyingKey, UniConfig, Val,
-    VerifierConstraintFolder,
+    air::PublicValues, Com, MachineProof, MachineProver, MachineRecord, OpeningProof,
+    PcsProverData, ProofShape, SP1CoreOpts, ShardProof, StarkGenericConfig, Val,
 };
 
-#[derive(Error, Debug)]
-pub enum SP1CoreProverError {
-    #[error("failed to execute program: {0}")]
-    ExecutionError(ExecutionError),
-    #[error("io error: {0}")]
-    IoError(io::Error),
-    #[error("serialization error: {0}")]
-    SerializationError(bincode::Error),
-}
-
-pub fn prove_simple<SC: StarkGenericConfig, P: MachineProver<SC, RiscvAir<SC::Val>>>(
-    config: SC,
-    mut runtime: Executor,
-) -> Result<(MachineProof<SC>, u64), SP1CoreProverError>
-where
-    SC::Challenger: Clone,
-    OpeningProof<SC>: Send + Sync,
-    Com<SC>: Send + Sync,
-    PcsProverData<SC>: Send + Sync,
-    // ShardMainData<SC>: Serialize + DeserializeOwned,
-    <SC as StarkGenericConfig>::Val: PrimeField32,
-{
-    // Setup the machine.
-    let machine = RiscvAir::machine(config);
-    let prover = P::new(machine);
-    let (pk, _) = prover.setup(runtime.program.as_ref());
-
-    // Set the shard numbers.
-    runtime.records.iter_mut().enumerate().for_each(|(i, shard)| {
-        shard.public_values.shard = (i + 1) as u32;
-    });
-
-    // Prove the program.
-    let mut challenger = prover.config().challenger();
-    let proving_start = Instant::now();
-    let proof =
-        prover.prove(&pk, runtime.records, &mut challenger, SP1CoreOpts::default()).unwrap();
-    let proving_duration = proving_start.elapsed().as_millis();
-    let nb_bytes = bincode::serialize(&proof).unwrap().len();
-
-    // Print the summary.
-    tracing::info!(
-        "summary: cycles={}, e2e={}, khz={:.2}, proofSize={}",
-        runtime.state.global_clk,
-        proving_duration,
-        (runtime.state.global_clk as f64 / proving_duration as f64),
-        Size::from_bytes(nb_bytes),
-    );
-
-    Ok((proof, runtime.state.global_clk))
-}
-
-pub fn prove<SC: StarkGenericConfig, P: MachineProver<SC, RiscvAir<SC::Val>>>(
+#[allow(clippy::too_many_arguments)]
+pub fn prove_core<SC: StarkGenericConfig, P: MachineProver<SC, RiscvAir<SC::Val>>>(
+    prover: &P,
+    pk: &P::DeviceProvingKey,
+    _: &StarkVerifyingKey<SC>,
     program: Program,
     stdin: &SP1Stdin,
-    config: SC,
     opts: SP1CoreOpts,
+    context: SP1Context,
     shape_config: Option<&CoreShapeConfig<SC::Val>>,
 ) -> Result<(MachineProof<SC>, Vec<u8>, u64), SP1CoreProverError>
 where
+    SC::Val: PrimeField32,
     SC::Challenger: 'static + Clone + Send,
-    <SC as StarkGenericConfig>::Val: PrimeField32,
     OpeningProof<SC>: Send,
     Com<SC>: Send + Sync,
     PcsProverData<SC>: Send + Sync,
 {
-    let machine = RiscvAir::machine(config);
-    let prover = P::new(machine);
-    let (pk, _) = prover.setup(&program);
-    prove_with_context::<SC, _>(
-        &prover,
-        &pk,
+    let (proof_tx, proof_rx) = channel();
+    let (shape_tx, shape_rx) = channel();
+    let (public_values, cycles) = prove_core_stream(
+        prover,
+        pk,
         program,
         stdin,
         opts,
-        Default::default(),
+        context,
         shape_config,
-    )
+        proof_tx,
+        shape_tx,
+    )?;
+
+    let _: Vec<_> = shape_rx.iter().collect();
+    let shard_proofs: Vec<ShardProof<SC>> = proof_rx.iter().collect();
+    let proof = MachineProof { shard_proofs };
+
+    Ok((proof, public_values, cycles))
 }
 
-pub fn prove_with_context<SC: StarkGenericConfig, P: MachineProver<SC, RiscvAir<SC::Val>>>(
+#[allow(clippy::too_many_arguments)]
+pub fn prove_core_stream<SC: StarkGenericConfig, P: MachineProver<SC, RiscvAir<SC::Val>>>(
     prover: &P,
     pk: &P::DeviceProvingKey,
     program: Program,
@@ -135,7 +85,9 @@ pub fn prove_with_context<SC: StarkGenericConfig, P: MachineProver<SC, RiscvAir<
     opts: SP1CoreOpts,
     context: SP1Context,
     shape_config: Option<&CoreShapeConfig<SC::Val>>,
-) -> Result<(MachineProof<SC>, Vec<u8>, u64), SP1CoreProverError>
+    proof_tx: Sender<ShardProof<SC>>,
+    shape_and_done_tx: Sender<(ProofShape, bool)>,
+) -> Result<(Vec<u8>, u64), SP1CoreProverError>
 where
     SC::Val: PrimeField32,
     SC::Challenger: 'static + Clone + Send,
@@ -165,7 +117,7 @@ where
         // Spawn the checkpoint generator thread.
         let checkpoint_generator_span = tracing::Span::current().clone();
         let (checkpoints_tx, checkpoints_rx) =
-            sync_channel::<(usize, File, bool)>(opts.checkpoints_channel_capacity);
+            sync_channel::<(usize, File, bool, u64)>(opts.checkpoints_channel_capacity);
         let checkpoint_generator_handle: ScopedJoinHandle<Result<_, SP1CoreProverError>> =
             s.spawn(move || {
                 let _span = checkpoint_generator_span.enter();
@@ -177,7 +129,7 @@ where
                         let _span = span.enter();
 
                         // Execute the runtime until we reach a checkpoint.
-                        let (checkpoint, done) = runtime
+                        let (checkpoint, _, done) = runtime
                             .execute_state(false)
                             .map_err(SP1CoreProverError::ExecutionError)?;
 
@@ -189,7 +141,9 @@ where
                             .map_err(SP1CoreProverError::IoError)?;
 
                         // Send the checkpoint.
-                        checkpoints_tx.send((index, checkpoint_file, done)).unwrap();
+                        checkpoints_tx
+                            .send((index, checkpoint_file, done, runtime.state.global_clk))
+                            .unwrap();
 
                         // If we've reached the final checkpoint, break out of the loop.
                         if done {
@@ -202,263 +156,36 @@ where
                 })
             });
 
-        // Spawn the workers for phase 1 record generation.
-        let p1_record_gen_sync = Arc::new(TurnBasedSync::new());
-        let p1_trace_gen_sync = Arc::new(TurnBasedSync::new());
-        let (p1_records_and_traces_tx, p1_records_and_traces_rx) =
-            sync_channel::<(Vec<ExecutionRecord>, Vec<Vec<(String, RowMajorMatrix<Val<SC>>)>>)>(
-                opts.records_and_traces_channel_capacity,
-            );
-        let p1_records_and_traces_tx = Arc::new(Mutex::new(p1_records_and_traces_tx));
-        let checkpoints_rx = Arc::new(Mutex::new(checkpoints_rx));
-
-        let checkpoints = Arc::new(Mutex::new(VecDeque::new()));
-        let state = Arc::new(Mutex::new(PublicValues::<u32, u32>::default().reset()));
-        let deferred = Arc::new(Mutex::new(ExecutionRecord::new(program.clone().into())));
-        let mut p1_record_and_trace_gen_handles = Vec::new();
-        for _ in 0..opts.trace_gen_workers {
-            let record_gen_sync = Arc::clone(&p1_record_gen_sync);
-            let trace_gen_sync = Arc::clone(&p1_trace_gen_sync);
-            let checkpoints_rx = Arc::clone(&checkpoints_rx);
-            let records_and_traces_tx = Arc::clone(&p1_records_and_traces_tx);
-
-            let checkpoints = Arc::clone(&checkpoints);
-            let state = Arc::clone(&state);
-            let deferred = Arc::clone(&deferred);
-            let program = program.clone();
-
-            let span = tracing::Span::current().clone();
-
-            let handle = s.spawn(move || {
-                let _span = span.enter();
-                tracing::debug_span!("phase 1 trace generation").in_scope(|| {
-                    loop {
-                        // Receive the latest checkpoint.
-                        let received = { checkpoints_rx.lock().unwrap().recv() };
-
-                        if let Ok((index, mut checkpoint, done)) = received {
-                            // Trace the checkpoint and reconstruct the execution records.
-                            let (mut records, _) = tracing::debug_span!("trace checkpoint")
-                                .in_scope(|| {
-                                    trace_checkpoint::<SC>(
-                                        program.clone(),
-                                        &checkpoint,
-                                        opts,
-                                        shape_config,
-                                    )
-                                });
-                            tracing::debug!("generated {} records", records.len());
-                            reset_seek(&mut checkpoint);
-
-                            // Wait for our turn to update the state.
-                            tracing::debug!("waiting for turn {}", index);
-                            record_gen_sync.wait_for_turn(index);
-
-                            // Update the public values & prover state for the shards which contain
-                            // "cpu events".
-                            let mut state = state.lock().unwrap();
-                            for record in records.iter_mut() {
-                                state.shard += 1;
-                                state.execution_shard = record.public_values.execution_shard;
-                                state.start_pc = record.public_values.start_pc;
-                                state.next_pc = record.public_values.next_pc;
-                                state.committed_value_digest =
-                                    record.public_values.committed_value_digest;
-                                state.deferred_proofs_digest =
-                                    record.public_values.deferred_proofs_digest;
-                                record.public_values = *state;
-                            }
-
-                            // Defer events that are too expensive to include in every shard.
-                            let mut deferred = deferred.lock().unwrap();
-                            for record in records.iter_mut() {
-                                deferred.append(&mut record.defer());
-                            }
-
-                            // See if any deferred shards are ready to be committed to.
-                            let mut deferred = deferred.split(done, opts.split_opts);
-                            tracing::debug!("deferred {} records", deferred.len());
-
-                            // Update the public values & prover state for the shards which do not
-                            // contain "cpu events" before committing to them.
-                            if !done {
-                                state.execution_shard += 1;
-                            }
-                            for record in deferred.iter_mut() {
-                                state.shard += 1;
-                                state.previous_init_addr_bits =
-                                    record.public_values.previous_init_addr_bits;
-                                state.last_init_addr_bits =
-                                    record.public_values.last_init_addr_bits;
-                                state.previous_finalize_addr_bits =
-                                    record.public_values.previous_finalize_addr_bits;
-                                state.last_finalize_addr_bits =
-                                    record.public_values.last_finalize_addr_bits;
-                                state.start_pc = state.next_pc;
-                                record.public_values = *state;
-                            }
-                            records.append(&mut deferred);
-
-                            // Collect the checkpoints to be used again in the phase 2 prover.
-                            tracing::debug!("collecting checkpoints");
-                            let mut checkpoints = checkpoints.lock().unwrap();
-                            checkpoints.push_back((index, checkpoint, done));
-
-                            // Let another worker update the state.
-                            record_gen_sync.advance_turn();
-
-                            // Fix the shape of the records.
-                            if let Some(shape_config) = shape_config {
-                                for record in records.iter_mut() {
-                                    tracing::debug!("fixing shape");
-                                    shape_config.fix_shape(record).unwrap();
-                                }
-                            }
-
-                            // Generate the traces.
-                            let mut traces = vec![];
-                            tracing::debug_span!("generate traces", index).in_scope(|| {
-                                traces = records
-                                    .par_iter()
-                                    .map(|record| {
-                                        prover.generate_traces(record, InteractionScope::Global)
-                                    })
-                                    .collect::<Vec<_>>();
-                            });
-
-                            // Wait for our turn.
-                            trace_gen_sync.wait_for_turn(index);
-
-                            // Send the records to the phase 1 prover.
-                            let chunked_records = chunk_vec(records, opts.shard_batch_size);
-                            let chunked_traces = chunk_vec(traces, opts.shard_batch_size);
-                            chunked_records.into_iter().zip(chunked_traces).for_each(
-                                |(records, traces)| {
-                                    records_and_traces_tx
-                                        .lock()
-                                        .unwrap()
-                                        .send((records, traces))
-                                        .unwrap();
-                                },
-                            );
-
-                            trace_gen_sync.advance_turn();
-                        } else {
-                            break;
-                        }
-                    }
-                })
-            });
-            p1_record_and_trace_gen_handles.push(handle);
-        }
-        drop(p1_records_and_traces_tx);
-
         // Create the challenger and observe the verifying key.
         let mut challenger = prover.config().challenger();
         pk.observe_into(&mut challenger);
 
-        // Spawn the phase 1 prover thread.
-        let phase_1_prover_span = tracing::Span::current().clone();
-        let phase_1_prover_handle = s.spawn(move || {
-            let _span = phase_1_prover_span.enter();
-            tracing::debug_span!("phase 1 prover").in_scope(|| {
-                for (records, traces) in p1_records_and_traces_rx.iter() {
-                    tracing::debug_span!("batch").in_scope(|| {
-                        let span = tracing::Span::current().clone();
-
-                        // Collect the public values.
-                        let public_values = records
-                            .iter()
-                            .map(|record| {
-                                record.public_values::<SC::Val>()[0..prover.machine().num_pv_elts()]
-                                    .to_vec()
-                            })
-                            .collect::<Vec<_>>();
-
-                        // Commit to each shard.
-                        let commitments = records
-                            .into_par_iter()
-                            .zip(traces.into_par_iter())
-                            .map(|(record, traces)| {
-                                let _span = span.enter();
-
-                                for (name, trace) in traces.clone() {
-                                    let trace_width = trace.width();
-                                    let trace_height = trace.height();
-                                    tracing::debug!(
-                                        "Phase 1 area: {:<15} | Main Cols = {:<5} | Rows = {:<5} | Cells = {:<10}",
-                                        name,
-                                        trace_width,
-                                        trace_height,
-                                        trace_width * trace_height,
-                                    );
-
-                                }
-
-                                let data = prover.commit(&record, traces);
-                                let phase1_main_commit = data.main_commit.clone();
-                                drop(data);
-                                phase1_main_commit
-                            })
-                            .collect::<Vec<_>>();
-
-                        //  the commitments.
-                        for (commit, public_values) in
-                            commitments.into_iter().zip(public_values.into_iter())
-                        {
-                            prover.observe(&mut challenger, commit.clone(), &public_values);
-                        }
-                    });
-                }
-            });
-
-            challenger
-        });
-
-        // Wait until the checkpoint generator handle has fully finished.
-        let public_values_stream = checkpoint_generator_handle.join().unwrap().unwrap();
-
-        // Wait until the records and traces have been fully generated.
-        p1_record_and_trace_gen_handles.into_iter().for_each(|handle| handle.join().unwrap());
-
-        // Wait until the phase 1 prover has completely finished.
-        let mut challenger = phase_1_prover_handle.join().unwrap();
-
-        // Sample for the global permutation challenges.
-        // Obtain the challenges used for the global permutation argument.
-        let mut global_permutation_challenges: Vec<SC::Challenge> = Vec::new();
-        for _ in 0..2 {
-            global_permutation_challenges.push(challenger.sample_ext_element());
-        }
-
         // Spawn the phase 2 record generator thread.
         let p2_record_gen_sync = Arc::new(TurnBasedSync::new());
         let p2_trace_gen_sync = Arc::new(TurnBasedSync::new());
         let (p2_records_and_traces_tx, p2_records_and_traces_rx) =
-            sync_channel::<(
-                Vec<ExecutionRecord>,
-                (
-                    Vec<Vec<(String, RowMajorMatrix<Val<SC>>)>>,
-                    Vec<Vec<(String, RowMajorMatrix<Val<SC>>)>>,
-                ),
-            )>(opts.records_and_traces_channel_capacity);
+            sync_channel::<(Vec<ExecutionRecord>, Vec<Vec<(String, RowMajorMatrix<Val<SC>>)>>)>(
+                opts.records_and_traces_channel_capacity,
+            );
         let p2_records_and_traces_tx = Arc::new(Mutex::new(p2_records_and_traces_tx));
 
+        let shape_tx = Arc::new(Mutex::new(shape_and_done_tx));
         let report_aggregate = Arc::new(Mutex::new(ExecutionReport::default()));
         let state = Arc::new(Mutex::new(PublicValues::<u32, u32>::default().reset()));
         let deferred = Arc::new(Mutex::new(ExecutionRecord::new(program.clone().into())));
         let mut p2_record_and_trace_gen_handles = Vec::new();
+        let checkpoints_rx = Arc::new(Mutex::new(checkpoints_rx));
         for _ in 0..opts.trace_gen_workers {
             let record_gen_sync = Arc::clone(&p2_record_gen_sync);
             let trace_gen_sync = Arc::clone(&p2_trace_gen_sync);
             let records_and_traces_tx = Arc::clone(&p2_records_and_traces_tx);
+            let checkpoints_rx = Arc::clone(&checkpoints_rx);
 
+            let shape_tx = Arc::clone(&shape_tx);
             let report_aggregate = Arc::clone(&report_aggregate);
-            let checkpoints = Arc::clone(&checkpoints);
             let state = Arc::clone(&state);
             let deferred = Arc::clone(&deferred);
             let program = program.clone();
-
             let span = tracing::Span::current().clone();
 
             #[cfg(feature = "debug")]
@@ -468,10 +195,8 @@ where
                 let _span = span.enter();
                 tracing::debug_span!("phase 2 trace generation").in_scope(|| {
                     loop {
-                        // Receive the latest checkpoint.
-                        let received = { checkpoints.lock().unwrap().pop_front() };
-                        if let Some((index, mut checkpoint, done)) = received {
-                            // Trace the checkpoint and reconstruct the execution records.
+                        let received = { checkpoints_rx.lock().unwrap().recv() };
+                        if let Ok((index, mut checkpoint, done, num_cycles)) = received {
                             let (mut records, report) = tracing::debug_span!("trace checkpoint")
                                 .in_scope(|| {
                                     trace_checkpoint::<SC>(
@@ -481,9 +206,13 @@ where
                                         shape_config,
                                     )
                                 });
-                            log::debug!("generated {} records", records.len());
+
+                            // Log the record count, merge the report, and rewind the checkpoint.
+                            log::info!("generated {} records", records.len());
                             *report_aggregate.lock().unwrap() += report;
-                            reset_seek(&mut checkpoint);
+                            checkpoint
+                                .seek(SeekFrom::Start(0))
+                                .expect("failed to seek to start of tempfile");
 
                             // Wait for our turn to update the state.
                             record_gen_sync.wait_for_turn(index);
@@ -503,15 +232,34 @@ where
                                 record.public_values = *state;
                             }
 
+                            tracing::info!("Records length:{}, done: {}", records.len(), done);
+
                             // Defer events that are too expensive to include in every shard.
                             let mut deferred = deferred.lock().unwrap();
                             for record in records.iter_mut() {
                                 deferred.append(&mut record.defer());
                             }
 
+                            // tracing::info!("Deferred length: {}", deferred.len());
+
+                            let last_record = if done
+                                && num_cycles < 1 << 26
+                                && deferred.global_memory_initialize_events.len()
+                                    < opts.split_opts.memory / 4
+                                && deferred.global_memory_finalize_events.len()
+                                    < opts.split_opts.memory / 4
+                            {
+                                tracing::info!("Number of cycles: {}", num_cycles);
+                                records.last_mut()
+                            } else {
+                                None
+                            };
+
+                            tracing::info!("Last record is some: {:?}", last_record.is_some());
+
                             // See if any deferred shards are ready to be committed to.
-                            let mut deferred = deferred.split(done, opts.split_opts);
-                            log::debug!("deferred {} records", deferred.len());
+                            let mut deferred = deferred.split(done, last_record, opts.split_opts);
+                            log::info!("deferred {} records", deferred.len());
 
                             // Update the public values & prover state for the shards which do not
                             // contain "cpu events" before committing to them.
@@ -548,27 +296,31 @@ where
                                 }
                             }
 
+                            // Send the shapes to the channel, if necessary.
+                            for record in records.iter() {
+                                let mut heights = vec![];
+                                let chips = prover.shard_chips(record).collect::<Vec<_>>();
+                                if let Some(shape) = record.shape.as_ref() {
+                                    for chip in chips.iter() {
+                                        let height = shape.inner[&chip.name()];
+                                        heights.push((chip.name().clone(), height));
+                                    }
+                                    shape_tx
+                                        .lock()
+                                        .unwrap()
+                                        .send((ProofShape::from_log2_heights(&heights), done))
+                                        .unwrap();
+                                }
+                            }
+
                             #[cfg(feature = "debug")]
                             all_records_tx.send(records.clone()).unwrap();
 
-                            // Generate the traces.
-                            let mut local_traces = Vec::new();
-                            tracing::debug_span!("generate local traces", index).in_scope(|| {
-                                local_traces = records
+                            let mut main_traces = Vec::new();
+                            tracing::debug_span!("generate main traces", index).in_scope(|| {
+                                main_traces = records
                                     .par_iter()
-                                    .map(|record| {
-                                        prover.generate_traces(record, InteractionScope::Local)
-                                    })
-                                    .collect::<Vec<_>>();
-                            });
-
-                            let mut global_traces = Vec::new();
-                            tracing::debug_span!("generate global traces", index).in_scope(|| {
-                                global_traces = records
-                                    .par_iter()
-                                    .map(|record| {
-                                        prover.generate_traces(record, InteractionScope::Global)
-                                    })
+                                    .map(|record| prover.generate_traces(record))
                                     .collect::<Vec<_>>();
                             });
 
@@ -576,19 +328,15 @@ where
 
                             // Send the records to the phase 2 prover.
                             let chunked_records = chunk_vec(records, opts.shard_batch_size);
-                            let chunked_global_traces =
-                                chunk_vec(global_traces, opts.shard_batch_size);
-                            let chunked_local_traces =
-                                chunk_vec(local_traces, opts.shard_batch_size);
+                            let chunked_main_traces = chunk_vec(main_traces, opts.shard_batch_size);
                             chunked_records
                                 .into_iter()
-                                .zip(chunked_global_traces.into_iter())
-                                .zip(chunked_local_traces.into_iter())
-                                .for_each(|((records, global_traces), local_traces)| {
+                                .zip(chunked_main_traces.into_iter())
+                                .for_each(|(records, main_traces)| {
                                     records_and_traces_tx
                                         .lock()
                                         .unwrap()
-                                        .send((records, (global_traces, local_traces)))
+                                        .send((records, main_traces))
                                         .unwrap();
                                 });
 
@@ -607,63 +355,62 @@ where
 
         // Spawn the phase 2 prover thread.
         let p2_prover_span = tracing::Span::current().clone();
+        let proof_tx = Arc::new(Mutex::new(proof_tx));
         let p2_prover_handle = s.spawn(move || {
             let _span = p2_prover_span.enter();
-            let mut shard_proofs = Vec::new();
             tracing::debug_span!("phase 2 prover").in_scope(|| {
                 for (records, traces) in p2_records_and_traces_rx.into_iter() {
                     tracing::debug_span!("batch").in_scope(|| {
                         let span = tracing::Span::current().clone();
-                        shard_proofs.par_extend(
-                            records.into_par_iter().zip(traces.into_par_iter()).map(
-                                |(record, (global_traces, local_traces))| {
-                                    let _span = span.enter();
-
-                                    let global_commit_span =
-                                        tracing::debug_span!("commit to global traces").entered();
-                                    let global_data = prover.commit(&record, global_traces);
-                                    global_commit_span.exit();
-                                    let local_commit_span =
-                                        tracing::debug_span!("commit to local traces").entered();
-                                    let local_data = prover.commit(&record, local_traces);
-                                    local_commit_span.exit();
-
-                                    let opening_span = tracing::debug_span!("opening").entered();
-                                    let proof = prover
-                                        .open(
-                                            pk,
-                                            Some(global_data),
-                                            local_data,
-                                            &mut challenger.clone(),
-                                            &global_permutation_challenges,
-                                        )
-                                        .unwrap();
-                                    opening_span.exit();
-
-                                    #[cfg(debug_assertions)]
-                                    {
-                                        if let Some(shape) = record.shape {
-                                            assert_eq!(
-                                                proof.shape(),
-                                                shape.clone().into_iter().collect(),
-                                            );
-                                        }
+                        let proofs = records
+                            .into_par_iter()
+                            .zip(traces.into_par_iter())
+                            .map(|(record, main_traces)| {
+                                let _span = span.enter();
+
+                                let main_data = prover.commit(&record, main_traces);
+
+                                let opening_span = tracing::debug_span!("opening").entered();
+                                let proof =
+                                    prover.open(pk, main_data, &mut challenger.clone()).unwrap();
+                                opening_span.exit();
+
+                                #[cfg(debug_assertions)]
+                                {
+                                    if let Some(shape) = record.shape.as_ref() {
+                                        assert_eq!(
+                                            proof.shape(),
+                                            shape.clone().into_iter().collect(),
+                                        );
                                     }
-                                    proof
-                                },
-                            ),
-                        );
+                                }
+
+                                rayon::spawn(move || {
+                                    drop(record);
+                                });
+
+                                proof
+                            })
+                            .collect::<Vec<_>>();
+
+                        // Send the batch of proofs to the channel.
+                        let proof_tx = proof_tx.lock().unwrap();
+                        for proof in proofs {
+                            proof_tx.send(proof).unwrap();
+                        }
                     });
                 }
             });
-            shard_proofs
         });
 
+        // Wait until the checkpoint generator handle has fully finished.
+        let public_values_stream = checkpoint_generator_handle.join().unwrap().unwrap();
+
         // Wait until the records and traces have been fully generated for phase 2.
         p2_record_and_trace_gen_handles.into_iter().for_each(|handle| handle.join().unwrap());
 
         // Wait until the phase 2 prover has finished.
-        let shard_proofs = p2_prover_handle.join().unwrap();
+        p2_prover_handle.join().unwrap();
 
         // Log some of the `ExecutionReport` information.
         let report_aggregate = report_aggregate.lock().unwrap();
@@ -696,18 +443,15 @@ where
             }
         }
 
-        let proof = MachineProof::<SC> { shard_proofs };
         let cycles = report_aggregate.total_instruction_count();
 
         // Print the summary.
         let proving_time = proving_start.elapsed().as_secs_f64();
         tracing::info!(
-            "summary: cycles={}, gas={}, e2e={}s, khz={:.2}, proofSize={}",
+            "summary: cycles={}, e2e={}s, khz={:.2}",
             cycles,
-            report_aggregate.estimate_gas(),
             proving_time,
             (cycles as f64 / (proving_time * 1000.0) as f64),
-            bincode::serialize(&proof).unwrap().len(),
         );
 
         #[cfg(feature = "debug")]
@@ -718,143 +462,11 @@ where
             prover.machine().debug_constraints(&pk_host, all_records, &mut challenger);
         }
 
-        Ok((proof, public_values_stream, cycles))
+        Ok((public_values_stream, cycles))
     })
 }
 
-/// Runs a program and returns the public values stream.
-pub fn run_test_io<P: MachineProver<BabyBearPoseidon2, RiscvAir<BabyBear>>>(
-    mut program: Program,
-    inputs: SP1Stdin,
-) -> Result<SP1PublicValues, MachineVerificationError<BabyBearPoseidon2>> {
-    let shape_config = CoreShapeConfig::<BabyBear>::default();
-    shape_config.fix_preprocessed_shape(&mut program).unwrap();
-    let runtime = tracing::debug_span!("runtime.run(...)").in_scope(|| {
-        let mut runtime = Executor::new(program, SP1CoreOpts::default());
-        runtime.maximal_shapes =
-            Some(shape_config.maximal_core_shapes().into_iter().map(|s| s.inner).collect());
-        runtime.write_vecs(&inputs.buffer);
-        runtime.run().unwrap();
-        runtime
-    });
-    let public_values = SP1PublicValues::from(&runtime.state.public_values_stream);
-
-    let _ = run_test_core::<P>(runtime, inputs, Some(&shape_config))?;
-    Ok(public_values)
-}
-
-pub fn run_test<P: MachineProver<BabyBearPoseidon2, RiscvAir<BabyBear>>>(
-    mut program: Program,
-) -> Result<MachineProof<BabyBearPoseidon2>, MachineVerificationError<BabyBearPoseidon2>> {
-    let shape_config = CoreShapeConfig::default();
-    shape_config.fix_preprocessed_shape(&mut program).unwrap();
-    let runtime = tracing::debug_span!("runtime.run(...)").in_scope(|| {
-        let mut runtime = Executor::new(program, SP1CoreOpts::default());
-        runtime.maximal_shapes =
-            Some(shape_config.maximal_core_shapes().into_iter().map(|s| s.inner).collect());
-        runtime.run().unwrap();
-        runtime
-    });
-    run_test_core::<P>(runtime, SP1Stdin::new(), Some(&shape_config))
-}
-
-#[allow(unused_variables)]
-pub fn run_test_core<P: MachineProver<BabyBearPoseidon2, RiscvAir<BabyBear>>>(
-    runtime: Executor,
-    inputs: SP1Stdin,
-    shape_config: Option<&CoreShapeConfig<BabyBear>>,
-) -> Result<MachineProof<BabyBearPoseidon2>, MachineVerificationError<BabyBearPoseidon2>> {
-    let config = BabyBearPoseidon2::new();
-    let machine = RiscvAir::machine(config);
-    let prover = P::new(machine);
-
-    let (pk, _) = prover.setup(runtime.program.as_ref());
-    let (proof, output, _) = prove_with_context(
-        &prover,
-        &pk,
-        Program::clone(&runtime.program),
-        &inputs,
-        SP1CoreOpts::default(),
-        SP1Context::default(),
-        shape_config,
-    )
-    .unwrap();
-
-    let config = BabyBearPoseidon2::new();
-    let machine = RiscvAir::machine(config);
-    let (pk, vk) = machine.setup(runtime.program.as_ref());
-    let mut challenger = machine.config().challenger();
-    machine.verify(&vk, &proof, &mut challenger).unwrap();
-
-    Ok(proof)
-}
-
-#[allow(unused_variables)]
-pub fn run_test_machine_with_prover<SC, A, P: MachineProver<SC, A>>(
-    prover: &P,
-    records: Vec<A::Record>,
-    pk: P::DeviceProvingKey,
-    vk: StarkVerifyingKey<SC>,
-) -> Result<MachineProof<SC>, MachineVerificationError<SC>>
-where
-    A: MachineAir<SC::Val>
-        + Air<InteractionBuilder<Val<SC>>>
-        + for<'a> Air<VerifierConstraintFolder<'a, SC>>
-        + for<'a> Air<DebugConstraintBuilder<'a, Val<SC>, SC::Challenge>>,
-    A::Record: MachineRecord<Config = SP1CoreOpts>,
-    SC: StarkGenericConfig,
-    SC::Val: p3_field::PrimeField32,
-    SC::Challenger: Clone,
-    Com<SC>: Send + Sync,
-    PcsProverData<SC>: Send + Sync + Serialize + DeserializeOwned,
-    OpeningProof<SC>: Send + Sync,
-{
-    let mut challenger = prover.config().challenger();
-    let prove_span = tracing::debug_span!("prove").entered();
-
-    #[cfg(feature = "debug")]
-    prover.machine().debug_constraints(
-        &prover.pk_to_host(&pk),
-        records.clone(),
-        &mut challenger.clone(),
-    );
-
-    let proof = prover.prove(&pk, records, &mut challenger, SP1CoreOpts::default()).unwrap();
-    prove_span.exit();
-    let nb_bytes = bincode::serialize(&proof).unwrap().len();
-
-    let mut challenger = prover.config().challenger();
-    prover.machine().verify(&vk, &proof, &mut challenger)?;
-
-    Ok(proof)
-}
-
-#[allow(unused_variables)]
-pub fn run_test_machine<SC, A>(
-    records: Vec<A::Record>,
-    machine: StarkMachine<SC, A>,
-    pk: StarkProvingKey<SC>,
-    vk: StarkVerifyingKey<SC>,
-) -> Result<MachineProof<SC>, MachineVerificationError<SC>>
-where
-    A: MachineAir<SC::Val>
-        + for<'a> Air<ProverConstraintFolder<'a, SC>>
-        + Air<InteractionBuilder<Val<SC>>>
-        + for<'a> Air<VerifierConstraintFolder<'a, SC>>
-        + for<'a> Air<DebugConstraintBuilder<'a, Val<SC>, SC::Challenge>>,
-    A::Record: MachineRecord<Config = SP1CoreOpts>,
-    SC: StarkGenericConfig,
-    SC::Val: p3_field::PrimeField32,
-    SC::Challenger: Clone,
-    Com<SC>: Send + Sync,
-    PcsProverData<SC>: Send + Sync + Serialize + DeserializeOwned,
-    OpeningProof<SC>: Send + Sync,
-{
-    let prover = CpuProver::new(machine);
-    run_test_machine_with_prover::<SC, A, CpuProver<_, _>>(&prover, records, pk, vk)
-}
-
-fn trace_checkpoint<SC: StarkGenericConfig>(
+pub fn trace_checkpoint<SC: StarkGenericConfig>(
     program: Program,
     file: &File,
     opts: SP1CoreOpts,
@@ -880,74 +492,12 @@ where
     (records, runtime.report)
 }
 
-fn reset_seek(file: &mut File) {
-    file.seek(std::io::SeekFrom::Start(0)).expect("failed to seek to start of tempfile");
-}
-
-#[cfg(debug_assertions)]
-#[cfg(not(doctest))]
-pub fn uni_stark_prove<SC, A>(
-    config: &SC,
-    air: &A,
-    challenger: &mut SC::Challenger,
-    trace: RowMajorMatrix<SC::Val>,
-) -> Proof<UniConfig<SC>>
-where
-    SC: StarkGenericConfig,
-    A: Air<p3_uni_stark::SymbolicAirBuilder<SC::Val>>
-        + for<'a> Air<p3_uni_stark::ProverConstraintFolder<'a, UniConfig<SC>>>
-        + for<'a> Air<p3_uni_stark::DebugConstraintBuilder<'a, SC::Val>>,
-{
-    p3_uni_stark::prove(&UniConfig(config.clone()), air, challenger, trace, &vec![])
-}
-
-#[cfg(not(debug_assertions))]
-pub fn uni_stark_prove<SC, A>(
-    config: &SC,
-    air: &A,
-    challenger: &mut SC::Challenger,
-    trace: RowMajorMatrix<SC::Val>,
-) -> Proof<UniConfig<SC>>
-where
-    SC: StarkGenericConfig,
-    A: Air<p3_uni_stark::SymbolicAirBuilder<SC::Val>>
-        + for<'a> Air<p3_uni_stark::ProverConstraintFolder<'a, UniConfig<SC>>>,
-{
-    p3_uni_stark::prove(&UniConfig(config.clone()), air, challenger, trace, &vec![])
-}
-
-#[cfg(debug_assertions)]
-#[cfg(not(doctest))]
-pub fn uni_stark_verify<SC, A>(
-    config: &SC,
-    air: &A,
-    challenger: &mut SC::Challenger,
-    proof: &Proof<UniConfig<SC>>,
-) -> Result<(), p3_uni_stark::VerificationError>
-where
-    SC: StarkGenericConfig,
-    A: Air<p3_uni_stark::SymbolicAirBuilder<SC::Val>>
-        + for<'a> Air<p3_uni_stark::VerifierConstraintFolder<'a, UniConfig<SC>>>
-        + for<'a> Air<p3_uni_stark::DebugConstraintBuilder<'a, SC::Val>>,
-{
-    p3_uni_stark::verify(&UniConfig(config.clone()), air, challenger, proof, &vec![])
-}
-
-#[cfg(not(debug_assertions))]
-pub fn uni_stark_verify<SC, A>(
-    config: &SC,
-    air: &A,
-    challenger: &mut SC::Challenger,
-    proof: &Proof<UniConfig<SC>>,
-) -> Result<(), p3_uni_stark::VerificationError>
-where
-    SC: StarkGenericConfig,
-    A: Air<p3_uni_stark::SymbolicAirBuilder<SC::Val>>
-        + for<'a> Air<p3_uni_stark::VerifierConstraintFolder<'a, UniConfig<SC>>>,
-{
-    p3_uni_stark::verify(&UniConfig(config.clone()), air, challenger, proof, &vec![])
+#[derive(Error, Debug)] // One variant per failure source: execution, I/O, serialization.
+pub enum SP1CoreProverError {
+    #[error("failed to execute program: {0}")]
+    ExecutionError(ExecutionError), // Displayed as "failed to execute program: <inner>".
+    #[error("io error: {0}")]
+    IoError(io::Error), // Displayed as "io error: <inner>".
+    #[error("serialization error: {0}")]
+    SerializationError(bincode::Error), // Displayed as "serialization error: <inner>".
 }
-
-use p3_air::Air;
-use p3_matrix::dense::RowMajorMatrix;
-use p3_uni_stark::Proof;
diff --git a/crates/core/machine/src/utils/test.rs b/crates/core/machine/src/utils/test.rs
new file mode 100644
index 0000000000..f18509a601
--- /dev/null
+++ b/crates/core/machine/src/utils/test.rs
@@ -0,0 +1,138 @@
+use p3_air::Air;
+use p3_baby_bear::BabyBear;
+use serde::{de::DeserializeOwned, Serialize};
+use sp1_core_executor::{Executor, Program, SP1Context};
+use sp1_primitives::io::SP1PublicValues;
+use sp1_stark::{
+    air::MachineAir, baby_bear_poseidon2::BabyBearPoseidon2, Com, CpuProver,
+    DebugConstraintBuilder, InteractionBuilder, MachineProof, MachineProver, MachineRecord,
+    MachineVerificationError, OpeningProof, PcsProverData, ProverConstraintFolder, SP1CoreOpts,
+    StarkGenericConfig, StarkMachine, StarkProvingKey, StarkVerifyingKey, Val,
+    VerifierConstraintFolder,
+};
+
+use crate::{
+    io::SP1Stdin,
+    riscv::{CoreShapeConfig, RiscvAir},
+};
+
+use super::prove_core;
+
+/// The canonical entry point for testing a [`Program`] and [`SP1Stdin`] with a [`MachineProver`].
+pub fn run_test<P: MachineProver<BabyBearPoseidon2, RiscvAir<BabyBear>>>(
+    mut program: Program,
+    inputs: SP1Stdin,
+) -> Result<SP1PublicValues, MachineVerificationError<BabyBearPoseidon2>> {
+    let shape_config = CoreShapeConfig::<BabyBear>::default();
+    shape_config.fix_preprocessed_shape(&mut program).unwrap(); // Panics if fixing the shape fails.
+    // Run the program once here so that its public values can be read from the executor state.
+    let runtime = tracing::debug_span!("runtime.run(...)").in_scope(|| {
+        let mut runtime = Executor::new(program, SP1CoreOpts::default());
+        runtime.maximal_shapes =
+            Some(shape_config.maximal_core_shapes().into_iter().map(|s| s.inner).collect());
+        runtime.write_vecs(&inputs.buffer);
+        runtime.run().unwrap();
+        runtime
+    });
+    let public_values = SP1PublicValues::from(&runtime.state.public_values_stream);
+    // The value returned by `run_test_core` is discarded; `?` forwards its error, if any.
+    let _ = run_test_core::<P>(runtime, inputs, Some(&shape_config))?;
+    Ok(public_values)
+}
+
+/// Proves `runtime.program` on `inputs` with a prover of type `P`, then verifies the proof with a
+/// freshly constructed [`RiscvAir`] machine and a verifying key re-derived from the same program.
+/// Verification failures are returned as `Err`; a failure of `prove_core` itself still panics.
+pub fn run_test_core<P: MachineProver<BabyBearPoseidon2, RiscvAir<BabyBear>>>(
+    runtime: Executor,
+    inputs: SP1Stdin,
+    shape_config: Option<&CoreShapeConfig<BabyBear>>,
+) -> Result<MachineProof<BabyBearPoseidon2>, MachineVerificationError<BabyBearPoseidon2>> {
+    let machine = RiscvAir::machine(BabyBearPoseidon2::new());
+    let prover = P::new(machine);
+
+    let (pk, vk) = prover.setup(runtime.program.as_ref());
+    let (proof, _output, _) = prove_core(
+        &prover,
+        &pk,
+        &vk,
+        Program::clone(&runtime.program),
+        &inputs,
+        SP1CoreOpts::default(),
+        SP1Context::default(),
+        shape_config,
+    )
+    .unwrap();
+    // Verify against a separately set-up machine rather than reusing the prover's key.
+    let machine = RiscvAir::machine(BabyBearPoseidon2::new());
+    let (_, vk) = machine.setup(runtime.program.as_ref());
+    let mut challenger = machine.config().challenger();
+    machine.verify(&vk, &proof, &mut challenger)?;
+
+    Ok(proof)
+}
+
+/// Proves `records` with `prover` under the default [`SP1CoreOpts`], then verifies the proof
+/// against `vk`. Returns the proof, or the verification error; panics if proving fails.
+pub fn run_test_machine_with_prover<SC, A, P: MachineProver<SC, A>>(
+    prover: &P,
+    records: Vec<A::Record>,
+    pk: P::DeviceProvingKey,
+    vk: StarkVerifyingKey<SC>,
+) -> Result<MachineProof<SC>, MachineVerificationError<SC>>
+where
+    A: MachineAir<SC::Val>
+        + Air<InteractionBuilder<Val<SC>>>
+        + for<'a> Air<VerifierConstraintFolder<'a, SC>>
+        + for<'a> Air<DebugConstraintBuilder<'a, Val<SC>, SC::Challenge>>,
+    A::Record: MachineRecord<Config = SP1CoreOpts>,
+    SC: StarkGenericConfig,
+    SC::Val: p3_field::PrimeField32,
+    SC::Challenger: Clone,
+    Com<SC>: Send + Sync,
+    PcsProverData<SC>: Send + Sync + Serialize + DeserializeOwned,
+    OpeningProof<SC>: Send + Sync,
+{
+    let mut challenger = prover.config().challenger();
+    let prove_span = tracing::debug_span!("prove").entered();
+
+    #[cfg(feature = "debug")]
+    prover.machine().debug_constraints(
+        &prover.pk_to_host(&pk),
+        records.clone(),
+        &mut challenger.clone(),
+    );
+
+    let proof = prover.prove(&pk, records, &mut challenger, SP1CoreOpts::default()).unwrap();
+    prove_span.exit();
+
+    let mut challenger = prover.config().challenger(); // Fresh challenger for verification.
+    prover.machine().verify(&vk, &proof, &mut challenger)?;
+
+    Ok(proof)
+}
+
+/// Builds a [`CpuProver`] for `machine` and delegates to [`run_test_machine_with_prover`].
+pub fn run_test_machine<SC, A>(
+    records: Vec<A::Record>,
+    machine: StarkMachine<SC, A>,
+    pk: StarkProvingKey<SC>,
+    vk: StarkVerifyingKey<SC>,
+) -> Result<MachineProof<SC>, MachineVerificationError<SC>>
+where
+    A: MachineAir<SC::Val>
+        + for<'a> Air<ProverConstraintFolder<'a, SC>>
+        + Air<InteractionBuilder<Val<SC>>>
+        + for<'a> Air<VerifierConstraintFolder<'a, SC>>
+        + for<'a> Air<DebugConstraintBuilder<'a, Val<SC>, SC::Challenge>>,
+    A::Record: MachineRecord<Config = SP1CoreOpts>,
+    SC: StarkGenericConfig,
+    SC::Val: p3_field::PrimeField32,
+    SC::Challenger: Clone,
+    Com<SC>: Send + Sync,
+    PcsProverData<SC>: Send + Sync + Serialize + DeserializeOwned,
+    OpeningProof<SC>: Send + Sync,
+{
+    let prover = CpuProver::new(machine);
+    run_test_machine_with_prover::<SC, A, CpuProver<_, _>>(&prover, records, pk, vk)
+}
diff --git a/crates/core/machine/src/utils/tracer.rs b/crates/core/machine/src/utils/tracer.rs
deleted file mode 100644
index 88a4c7f6ac..0000000000
--- a/crates/core/machine/src/utils/tracer.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-use std::env;
-
-use tracing::level_filters::LevelFilter;
-use tracing_forest::ForestLayer;
-use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Registry};
-
-/// A tracer to benchmark the performance of the vm.
-///
-/// Set the `RUST_TRACER` environment variable to be set to `info` or `debug`.
-/// ! DEPRECATED: don't use this function, use `setup_logger` instead.
-pub fn setup_tracer() {
-    let tracer_config = env::var("RUST_TRACER").unwrap_or_else(|_| "none".to_string());
-    let mut env_filter = EnvFilter::builder()
-        .with_default_directive(LevelFilter::OFF.into())
-        .with_default_directive("log::=off".parse().unwrap())
-        .from_env_lossy();
-    if tracer_config == "info" {
-        env_filter = env_filter.add_directive("sp1_core=info".parse().unwrap());
-    } else if tracer_config == "debug" {
-        env_filter = env_filter.add_directive("sp1_core=debug".parse().unwrap());
-    }
-    Registry::default().with(env_filter).with(ForestLayer::default()).init();
-}
diff --git a/crates/core/machine/src/utils/uni_stark.rs b/crates/core/machine/src/utils/uni_stark.rs
new file mode 100644
index 0000000000..c7d510c95d
--- /dev/null
+++ b/crates/core/machine/src/utils/uni_stark.rs
@@ -0,0 +1,68 @@
+use p3_air::Air;
+use p3_matrix::dense::RowMajorMatrix;
+use p3_uni_stark::Proof;
+use sp1_stark::{StarkGenericConfig, UniConfig};
+
+#[cfg(debug_assertions)] // Variant compiled when debug assertions are enabled...
+#[cfg(not(doctest))] // ...and left out of doctest builds.
+pub fn uni_stark_prove<SC, A>(
+    config: &SC,
+    air: &A,
+    challenger: &mut SC::Challenger,
+    trace: RowMajorMatrix<SC::Val>,
+) -> Proof<UniConfig<SC>>
+where
+    SC: StarkGenericConfig,
+    A: Air<p3_uni_stark::SymbolicAirBuilder<SC::Val>>
+        + for<'a> Air<p3_uni_stark::ProverConstraintFolder<'a, UniConfig<SC>>>
+        + for<'a> Air<p3_uni_stark::DebugConstraintBuilder<'a, SC::Val>>, // Bound required only by this variant.
+{
+    p3_uni_stark::prove(&UniConfig(config.clone()), air, challenger, trace, &vec![]) // Wraps a clone of `config` in `UniConfig`; the last argument is passed empty.
+}
+
+#[cfg(not(debug_assertions))] // Variant for builds without debug assertions; no `DebugConstraintBuilder` bound on `A`.
+pub fn uni_stark_prove<SC, A>(
+    config: &SC,
+    air: &A,
+    challenger: &mut SC::Challenger,
+    trace: RowMajorMatrix<SC::Val>,
+) -> Proof<UniConfig<SC>>
+where
+    SC: StarkGenericConfig,
+    A: Air<p3_uni_stark::SymbolicAirBuilder<SC::Val>>
+        + for<'a> Air<p3_uni_stark::ProverConstraintFolder<'a, UniConfig<SC>>>,
+{
+    p3_uni_stark::prove(&UniConfig(config.clone()), air, challenger, trace, &vec![]) // Wraps a clone of `config` in `UniConfig`; the last argument is passed empty.
+}
+
+#[cfg(debug_assertions)] // Variant compiled when debug assertions are enabled...
+#[cfg(not(doctest))] // ...and left out of doctest builds.
+pub fn uni_stark_verify<SC, A>(
+    config: &SC,
+    air: &A,
+    challenger: &mut SC::Challenger,
+    proof: &Proof<UniConfig<SC>>,
+) -> Result<(), p3_uni_stark::VerificationError>
+where
+    SC: StarkGenericConfig,
+    A: Air<p3_uni_stark::SymbolicAirBuilder<SC::Val>>
+        + for<'a> Air<p3_uni_stark::VerifierConstraintFolder<'a, UniConfig<SC>>>
+        + for<'a> Air<p3_uni_stark::DebugConstraintBuilder<'a, SC::Val>>, // Bound required only by this variant.
+{
+    p3_uni_stark::verify(&UniConfig(config.clone()), air, challenger, proof, &vec![]) // Wraps a clone of `config` in `UniConfig`; the last argument is passed empty.
+}
+
+#[cfg(not(debug_assertions))] // Variant for builds without debug assertions; no `DebugConstraintBuilder` bound on `A`.
+pub fn uni_stark_verify<SC, A>(
+    config: &SC,
+    air: &A,
+    challenger: &mut SC::Challenger,
+    proof: &Proof<UniConfig<SC>>,
+) -> Result<(), p3_uni_stark::VerificationError>
+where
+    SC: StarkGenericConfig,
+    A: Air<p3_uni_stark::SymbolicAirBuilder<SC::Val>>
+        + for<'a> Air<p3_uni_stark::VerifierConstraintFolder<'a, UniConfig<SC>>>,
+{
+    p3_uni_stark::verify(&UniConfig(config.clone()), air, challenger, proof, &vec![]) // Wraps a clone of `config` in `UniConfig`; the last argument is passed empty.
+}
diff --git a/crates/cuda/proto/api.proto b/crates/cuda/proto/api.proto
index c5e078676b..ec93419428 100644
--- a/crates/cuda/proto/api.proto
+++ b/crates/cuda/proto/api.proto
@@ -3,6 +3,7 @@ syntax = "proto3";
 package api;
 
 service ProverService {
+    rpc Setup(SetupRequest) returns (SetupResponse) {}
     rpc Ready(ReadyRequest) returns (ReadyResponse) {}
     rpc ProveCore(ProveCoreRequest) returns (ProveCoreResponse) {}
     rpc Compress(CompressRequest) returns (CompressResponse) {}
@@ -16,6 +17,14 @@ message ReadyResponse {
     bool ready = 1;
 }
 
+message SetupRequest {
+    bytes data = 1; // bincode-serialized `SetupRequestPayload` (see crates/cuda/src/lib.rs).
+}
+
+message SetupResponse {
+    bytes result = 1; // bincode-serialized `SetupResponsePayload` (see crates/cuda/src/lib.rs).
+}
+
 message ProveCoreRequest {
     bytes data = 1;
 }
diff --git a/crates/cuda/src/lib.rs b/crates/cuda/src/lib.rs
index 3493f6a58c..5f04f1cd10 100644
--- a/crates/cuda/src/lib.rs
+++ b/crates/cuda/src/lib.rs
@@ -17,7 +17,7 @@ use reqwest::{Request, Response};
 use serde::{Deserialize, Serialize};
 use sp1_core_machine::{io::SP1Stdin, reduce::SP1ReduceProof, utils::SP1CoreProverError};
 use sp1_prover::{
-    types::SP1ProvingKey, InnerSC, OuterSC, SP1CoreProof, SP1RecursionProverError, SP1VerifyingKey,
+    InnerSC, OuterSC, SP1CoreProof, SP1ProvingKey, SP1RecursionProverError, SP1VerifyingKey,
 };
 use tokio::task::block_in_place;
 use twirp::{
@@ -46,13 +46,28 @@ pub struct SP1CudaProver {
     cleaned_up: Arc<AtomicBool>,
 }
 
+/// The payload for the [sp1_prover::SP1Prover::setup] method.
+///
+/// We use this object to serialize and deserialize the payload from the client to the server.
+#[derive(Serialize, Deserialize)]
+pub struct SetupRequestPayload {
+    pub elf: Vec<u8>, // The ELF bytes passed to `SP1CudaProver::setup`.
+}
+
+/// The payload for the [sp1_prover::SP1Prover::setup] method response.
+///
+/// We use this object to serialize and deserialize the payload from the server to the client.
+#[derive(Serialize, Deserialize)]
+pub struct SetupResponsePayload {
+    pub pk: SP1ProvingKey, // The proving key returned to the caller of `SP1CudaProver::setup`.
+    pub vk: SP1VerifyingKey, // The verifying key returned alongside `pk`.
+}
+
 /// The payload for the [sp1_prover::SP1Prover::prove_core] method.
 ///
 /// We use this object to serialize and deserialize the payload from the client to the server.
 #[derive(Serialize, Deserialize)]
 pub struct ProveCoreRequestPayload {
-    /// The proving key.
-    pub pk: SP1ProvingKey,
     /// The input stream.
     pub stdin: SP1Stdin,
 }
@@ -91,7 +106,8 @@ impl SP1CudaProver {
     /// [SP1ProverClient] that can be used to communicate with the container.
     pub fn new() -> Result<Self, Box<dyn StdError>> {
         let container_name = "sp1-gpu";
-        let image_name = "public.ecr.aws/succinct-labs/sp1-gpu:7e66232";
+        let image_name = std::env::var("SP1_GPU_IMAGE")
+            .unwrap_or_else(|_| "public.ecr.aws/succinct-labs/moongate:v4.0.0-rc2".to_string());
 
         let cleaned_up = Arc::new(AtomicBool::new(false));
         let cleanup_name = container_name;
@@ -103,7 +119,7 @@ impl SP1CudaProver {
         }
 
         // Pull the docker image if it's not present
-        if let Err(e) = Command::new("docker").args(["pull", image_name]).output() {
+        if let Err(e) = Command::new("docker").args(["pull", &image_name]).output() {
             return Err(format!("Failed to pull Docker image: {}. Please check your internet connection and Docker permissions.", e).into());
         }
 
@@ -114,6 +130,10 @@ impl SP1CudaProver {
                 "run",
                 "-e",
                 &format!("RUST_LOG={}", rust_log_level),
+                "-e",
+                "FIX_RECURSION_SHAPES=false",
+                "-e",
+                "FIX_CORE_SHAPES=false",
                 "-p",
                 "3000:3000",
                 "--rm",
@@ -121,7 +141,7 @@ impl SP1CudaProver {
                 "all",
                 "--name",
                 container_name,
-                image_name,
+                &image_name,
             ])
             .stdout(Stdio::piped())
             .stderr(Stdio::piped())
@@ -229,17 +249,21 @@ impl SP1CudaProver {
         }
     }
 
+    /// Executes the [sp1_prover::SP1Prover::setup] method inside the container, returning an
+    /// error instead of panicking if payload (de)serialization or the RPC call fails.
+    pub fn setup(&self, elf: &[u8]) -> Result<(SP1ProvingKey, SP1VerifyingKey), Box<dyn StdError>> {
+        let payload = SetupRequestPayload { elf: elf.to_vec() };
+        let request = crate::proto::api::SetupRequest { data: bincode::serialize(&payload)? };
+        let response = block_on(async { self.client.setup(request).await })?;
+        let payload: SetupResponsePayload = bincode::deserialize(&response.result)?;
+        Ok((payload.pk, payload.vk))
+    }
+
     /// Executes the [sp1_prover::SP1Prover::prove_core] method inside the container.
     ///
     /// You will need at least 24GB of VRAM to run this method.
-    ///
-    /// **WARNING**: This is an experimental feature and may not work as expected.
-    pub fn prove_core(
-        &self,
-        pk: &SP1ProvingKey,
-        stdin: &SP1Stdin,
-    ) -> Result<SP1CoreProof, SP1CoreProverError> {
-        let payload = ProveCoreRequestPayload { pk: pk.clone(), stdin: stdin.clone() };
+    pub fn prove_core(&self, stdin: &SP1Stdin) -> Result<SP1CoreProof, SP1CoreProverError> {
+        let payload = ProveCoreRequestPayload { stdin: stdin.clone() };
         let request =
             crate::proto::api::ProveCoreRequest { data: bincode::serialize(&payload).unwrap() };
         let response = block_on(async { self.client.prove_core(request).await }).unwrap();
@@ -250,8 +274,6 @@ impl SP1CudaProver {
     /// Executes the [sp1_prover::SP1Prover::compress] method inside the container.
     ///
     /// You will need at least 24GB of VRAM to run this method.
-    ///
-    /// **WARNING**: This is an experimental feature and may not work as expected.
     pub fn compress(
         &self,
         vk: &SP1VerifyingKey,
@@ -269,9 +291,7 @@ impl SP1CudaProver {
 
     /// Executes the [sp1_prover::SP1Prover::shrink] method inside the container.
     ///
-    /// You will need at least 40GB of VRAM to run this method.
-    ///
-    /// **WARNING**: This is an experimental feature and may not work as expected.
+    /// You will need at least 24GB of VRAM to run this method.
     pub fn shrink(
         &self,
         reduced_proof: SP1ReduceProof<InnerSC>,
@@ -287,9 +307,7 @@ impl SP1CudaProver {
 
     /// Executes the [sp1_prover::SP1Prover::wrap_bn254] method inside the container.
     ///
-    /// You will need at least 40GB of VRAM to run this method.
-    ///
-    /// **WARNING**: This is an experimental feature and may not work as expected.
+    /// You will need at least 24GB of VRAM to run this method.
     pub fn wrap_bn254(
         &self,
         reduced_proof: SP1ReduceProof<InnerSC>,
@@ -363,75 +381,77 @@ impl Middleware for LoggingMiddleware {
     }
 }
 
-#[cfg(feature = "protobuf")]
-#[cfg(test)]
-mod tests {
-    use sp1_core_machine::{reduce::SP1ReduceProof, utils::setup_logger};
-    use sp1_prover::{components::DefaultProverComponents, InnerSC, SP1CoreProof, SP1Prover};
-    use test_artifacts::FIBONACCI_ELF;
-    use twirp::{url::Url, Client};
-
-    use crate::{
-        proto::api::ProverServiceClient, CompressRequestPayload, ProveCoreRequestPayload,
-        SP1CudaProver, SP1Stdin,
-    };
-
-    #[test]
-    fn test_client() {
-        setup_logger();
-
-        let prover = SP1Prover::<DefaultProverComponents>::new();
-        let client = SP1CudaProver::new().expect("Failed to create SP1CudaProver");
-        let (pk, vk) = prover.setup(FIBONACCI_ELF);
-
-        println!("proving core");
-        let proof = client.prove_core(&pk, &SP1Stdin::new()).unwrap();
-
-        println!("verifying core");
-        prover.verify(&proof.proof, &vk).unwrap();
-
-        println!("proving compress");
-        let proof = client.compress(&vk, proof, vec![]).unwrap();
-
-        println!("verifying compress");
-        prover.verify_compressed(&proof, &vk).unwrap();
-
-        println!("proving shrink");
-        let proof = client.shrink(proof).unwrap();
-
-        println!("verifying shrink");
-        prover.verify_shrink(&proof, &vk).unwrap();
-
-        println!("proving wrap_bn254");
-        let proof = client.wrap_bn254(proof).unwrap();
-
-        println!("verifying wrap_bn254");
-        prover.verify_wrap_bn254(&proof, &vk).unwrap();
-    }
-
-    #[tokio::test]
-    async fn test_prove_core() {
-        let client =
-            Client::from_base_url(Url::parse("http://localhost:3000/twirp/").unwrap()).unwrap();
-
-        let prover = SP1Prover::<DefaultProverComponents>::new();
-        let (pk, vk) = prover.setup(FIBONACCI_ELF);
-        let payload = ProveCoreRequestPayload { pk, stdin: SP1Stdin::new() };
-        let request =
-            crate::proto::api::ProveCoreRequest { data: bincode::serialize(&payload).unwrap() };
-        let proof = client.prove_core(request).await.unwrap();
-        let proof: SP1CoreProof = bincode::deserialize(&proof.result).unwrap();
-        prover.verify(&proof.proof, &vk).unwrap();
-
-        tracing::info!("compress");
-        let payload = CompressRequestPayload { vk: vk.clone(), proof, deferred_proofs: vec![] };
-        let request =
-            crate::proto::api::CompressRequest { data: bincode::serialize(&payload).unwrap() };
-        let compressed_proof = client.compress(request).await.unwrap();
-        let compressed_proof: SP1ReduceProof<InnerSC> =
-            bincode::deserialize(&compressed_proof.result).unwrap();
-
-        tracing::info!("verify compressed");
-        prover.verify_compressed(&compressed_proof, &vk).unwrap();
-    }
-}
+// #[cfg(feature = "protobuf")]
+// #[cfg(test)]
+// mod tests {
+//     use sp1_core_machine::{
+//         reduce::SP1ReduceProof,
+//         utils::{setup_logger, tests::FIBONACCI_ELF},
+//     };
+//     use sp1_prover::{components::DefaultProverComponents, InnerSC, SP1CoreProof, SP1Prover};
+//     use twirp::{url::Url, Client};
+
+//     use crate::{
+//         proto::api::ProverServiceClient, CompressRequestPayload, ProveCoreRequestPayload,
+//         SP1CudaProver, SP1Stdin,
+//     };
+
+//     #[test]
+//     fn test_client() {
+//         setup_logger();
+
+//         let prover = SP1Prover::<DefaultProverComponents>::new();
+//         let client = SP1CudaProver::new().expect("Failed to create SP1CudaProver");
+//         let (pk, vk) = prover.setup(FIBONACCI_ELF);
+
+//         println!("proving core");
+//         let proof = client.prove_core(&pk, &SP1Stdin::new()).unwrap();
+
+//         println!("verifying core");
+//         prover.verify(&proof.proof, &vk).unwrap();
+
+//         println!("proving compress");
+//         let proof = client.compress(&vk, proof, vec![]).unwrap();
+
+//         println!("verifying compress");
+//         prover.verify_compressed(&proof, &vk).unwrap();
+
+//         println!("proving shrink");
+//         let proof = client.shrink(proof).unwrap();
+
+//         println!("verifying shrink");
+//         prover.verify_shrink(&proof, &vk).unwrap();
+
+//         println!("proving wrap_bn254");
+//         let proof = client.wrap_bn254(proof).unwrap();
+
+//         println!("verifying wrap_bn254");
+//         prover.verify_wrap_bn254(&proof, &vk).unwrap();
+//     }
+
+//     #[tokio::test]
+//     async fn test_prove_core() {
+//         let client =
+//             Client::from_base_url(Url::parse("http://localhost:3000/twirp/").unwrap()).unwrap();
+
+//         let prover = SP1Prover::<DefaultProverComponents>::new();
+//         let (pk, vk) = prover.setup(FIBONACCI_ELF);
+//         let payload = ProveCoreRequestPayload { pk, stdin: SP1Stdin::new() };
+//         let request =
+//             crate::proto::api::ProveCoreRequest { data: bincode::serialize(&payload).unwrap() };
+//         let proof = client.prove_core(request).await.unwrap();
+//         let proof: SP1CoreProof = bincode::deserialize(&proof.result).unwrap();
+//         prover.verify(&proof.proof, &vk).unwrap();
+
+//         tracing::info!("compress");
+//         let payload = CompressRequestPayload { vk: vk.clone(), proof, deferred_proofs: vec![] };
+//         let request =
+//             crate::proto::api::CompressRequest { data: bincode::serialize(&payload).unwrap() };
+//         let compressed_proof = client.compress(request).await.unwrap();
+//         let compressed_proof: SP1ReduceProof<InnerSC> =
+//             bincode::deserialize(&compressed_proof.result).unwrap();
+
+//         tracing::info!("verify compressed");
+//         prover.verify_compressed(&compressed_proof, &vk).unwrap();
+//     }
+// }
diff --git a/crates/cuda/src/proto/api.rs b/crates/cuda/src/proto/api.rs
index 54aea9d77d..840055f181 100644
--- a/crates/cuda/src/proto/api.rs
+++ b/crates/cuda/src/proto/api.rs
@@ -10,6 +10,18 @@ pub struct ReadyResponse {
 }
 #[derive(serde::Serialize, serde::Deserialize)]
 #[derive(Clone, PartialEq, ::prost::Message)]
+pub struct SetupRequest {
+    #[prost(bytes = "vec", tag = "1")]
+    pub data: ::prost::alloc::vec::Vec<u8>,
+}
+#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct SetupResponse {
+    #[prost(bytes = "vec", tag = "1")]
+    pub result: ::prost::alloc::vec::Vec<u8>,
+}
+#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ProveCoreRequest {
     #[prost(bytes = "vec", tag = "1")]
     pub data: ::prost::alloc::vec::Vec<u8>,
@@ -60,6 +72,11 @@ pub use twirp;
 pub const SERVICE_FQN: &str = "/api.ProverService";
 #[twirp::async_trait::async_trait]
 pub trait ProverService {
+    async fn setup(
+        &self,
+        ctx: twirp::Context,
+        req: SetupRequest,
+    ) -> Result<SetupResponse, twirp::TwirpErrorResponse>;
     async fn ready(
         &self,
         ctx: twirp::Context,
@@ -91,6 +108,13 @@ impl<T> ProverService for std::sync::Arc<T>
 where
     T: ProverService + Sync + Send,
 {
+    async fn setup(
+        &self,
+        ctx: twirp::Context,
+        req: SetupRequest,
+    ) -> Result<SetupResponse, twirp::TwirpErrorResponse> {
+        T::setup(&*self, ctx, req).await
+    }
     async fn ready(
         &self,
         ctx: twirp::Context,
@@ -132,6 +156,12 @@ where
     T: ProverService + Clone + Send + Sync + 'static,
 {
     twirp::details::TwirpRouterBuilder::new(api)
+        .route(
+            "/Setup",
+            |api: T, ctx: twirp::Context, req: SetupRequest| async move {
+                api.setup(ctx, req).await
+            },
+        )
         .route(
             "/Ready",
             |api: T, ctx: twirp::Context, req: ReadyRequest| async move {
@@ -166,6 +196,10 @@ where
 }
 #[twirp::async_trait::async_trait]
 pub trait ProverServiceClient: Send + Sync + std::fmt::Debug {
+    async fn setup(
+        &self,
+        req: SetupRequest,
+    ) -> Result<SetupResponse, twirp::ClientError>;
     async fn ready(
         &self,
         req: ReadyRequest,
@@ -186,6 +220,12 @@ pub trait ProverServiceClient: Send + Sync + std::fmt::Debug {
 }
 #[twirp::async_trait::async_trait]
 impl ProverServiceClient for twirp::client::Client {
+    async fn setup(
+        &self,
+        req: SetupRequest,
+    ) -> Result<SetupResponse, twirp::ClientError> {
+        self.request("api.ProverService/Setup", req).await
+    }
     async fn ready(
         &self,
         req: ReadyRequest,
diff --git a/crates/curves/Cargo.toml b/crates/curves/Cargo.toml
index c656dbd02b..268260c23e 100644
--- a/crates/curves/Cargo.toml
+++ b/crates/curves/Cargo.toml
@@ -13,13 +13,13 @@ categories = { workspace = true }
 num = "0.4.3"
 serde = { workspace = true, features = ["derive"] }
 typenum = "1.17.0"
-curve25519-dalek = { version = "4.1.2" }
 k256 = { version = "0.13.3", features = ["expose-field"] }
 p256 = { version = "0.13.2", features = ["expose-field"] }
 generic-array = { version = "1.1.0", features = ["alloc", "serde"] }
 amcl = { package = "snowbridge-amcl", version = "1.0.2", default-features = false, features = [
   "bls381",
 ] }
+ecdsa = "0.16.9"
 elliptic-curve = "0.13.8"
 dashu = "0.4.2"
 
diff --git a/crates/curves/src/edwards/ed25519.rs b/crates/curves/src/edwards/ed25519.rs
index c2a823c603..4d1be91ac0 100644
--- a/crates/curves/src/edwards/ed25519.rs
+++ b/crates/curves/src/edwards/ed25519.rs
@@ -1,6 +1,6 @@
 use std::str::FromStr;
 
-use curve25519_dalek::edwards::CompressedEdwardsY;
+use crate::curve25519_dalek::CompressedEdwardsY;
 use generic_array::GenericArray;
 use num::{BigUint, Num, One};
 use serde::{Deserialize, Serialize};
@@ -72,7 +72,7 @@ impl EdwardsParameters for Ed25519Parameters {
 ///
 /// This function always returns the nonnegative square root, in the sense that the least
 /// significant bit of the result is always 0.
-pub fn ed25519_sqrt(a: &BigUint) -> BigUint {
+pub fn ed25519_sqrt(a: &BigUint) -> Option<BigUint> {
     // Here is a description of how to calculate sqrt in the Curve25519 base field:
     // ssh://git@github.com/succinctlabs/curve25519-dalek/blob/
     // e2d1bd10d6d772af07cac5c8161cd7655016af6d/curve25519-dalek/src/field.rs#L256
@@ -108,7 +108,7 @@ pub fn ed25519_sqrt(a: &BigUint) -> BigUint {
     let flipped_sign_sqrt = beta_squared == neg_a;
 
     if !correct_sign_sqrt && !flipped_sign_sqrt {
-        panic!("a is not a square");
+        return None;
     }
 
     let beta_bytes = beta.to_bytes_le();
@@ -116,10 +116,10 @@ pub fn ed25519_sqrt(a: &BigUint) -> BigUint {
         beta = (&modulus - &beta) % &modulus;
     }
 
-    beta
+    Some(beta)
 }
 
-pub fn decompress(compressed_point: &CompressedEdwardsY) -> AffinePoint<Ed25519> {
+pub fn decompress(compressed_point: &CompressedEdwardsY) -> Option<AffinePoint<Ed25519>> {
     let mut point_bytes = *compressed_point.as_bytes();
     let sign = point_bytes[31] >> 7 == 1;
     // mask out the sign bit
@@ -134,7 +134,7 @@ pub fn decompress(compressed_point: &CompressedEdwardsY) -> AffinePoint<Ed25519>
     let v_inv = v.modpow(&(modulus - BigUint::from(2u64)), modulus);
     let u_div_v = (u * &v_inv) % modulus;
 
-    let mut x = ed25519_sqrt(&u_div_v);
+    let mut x = ed25519_sqrt(&u_div_v)?;
 
     // sqrt always returns the nonnegative square root,
     // so we negate according to the supplied sign bit.
@@ -142,7 +142,7 @@ pub fn decompress(compressed_point: &CompressedEdwardsY) -> AffinePoint<Ed25519>
         x = modulus - &x;
     }
 
-    AffinePoint::new(x, y.clone())
+    Some(AffinePoint::new(x, y.clone()))
 }
 
 #[cfg(test)]
@@ -178,7 +178,7 @@ mod tests {
 
                 CompressedEdwardsY(compressed)
             };
-            assert_eq!(point, decompress(&compressed_point));
+            assert_eq!(point, decompress(&compressed_point).unwrap());
 
             // Double the point to create a "random" point for the next iteration.
             point = point.clone() + point.clone();
diff --git a/crates/curves/src/lib.rs b/crates/curves/src/lib.rs
index c3f66d9eac..73682a74d6 100644
--- a/crates/curves/src/lib.rs
+++ b/crates/curves/src/lib.rs
@@ -7,7 +7,39 @@ pub mod utils;
 pub mod weierstrass;
 
 pub mod curve25519_dalek {
-    pub use curve25519_dalek::edwards::CompressedEdwardsY;
+    /// In "Edwards y" / "Ed25519" format, the curve point \\((x,y)\\) is
+    /// determined by the \\(y\\)-coordinate and the sign of \\(x\\).
+    ///
+    /// The first 255 bits of a `CompressedEdwardsY` represent the
+    /// \\(y\\)-coordinate.  The high bit of the 32nd byte gives the sign of \\(x\\).
+    ///
+    /// Note: This is taken from the `curve25519-dalek` crate.
+    #[derive(Copy, Clone, Eq, PartialEq, Hash)]
+    pub struct CompressedEdwardsY(pub [u8; 32]);
+
+    impl CompressedEdwardsY {
+        /// View this `CompressedEdwardsY` as a byte array.
+        pub fn as_bytes(&self) -> &[u8; 32] {
+            &self.0
+        }
+
+        /// Copy the underlying byte array out of this `CompressedEdwardsY`.
+        pub fn to_bytes(&self) -> [u8; 32] {
+            self.0
+        }
+
+        /// Construct a `CompressedEdwardsY` from a slice of bytes.
+        ///
+        /// # Errors
+        ///
+        /// Returns [`core::array::TryFromSliceError`] if the input `bytes` slice does not have
+        /// a length of 32.
+        pub fn from_slice(
+            bytes: &[u8],
+        ) -> Result<CompressedEdwardsY, core::array::TryFromSliceError> {
+            bytes.try_into().map(CompressedEdwardsY)
+        }
+    }
 }
 
 pub mod k256 {
@@ -24,6 +56,10 @@ pub mod p256 {
     };
 }
 
+pub mod ecdsa {
+    pub use ecdsa::RecoveryId;
+}
+
 use params::{FieldParameters, NumWords};
 use sp1_primitives::consts::WORD_SIZE;
 use std::{
diff --git a/crates/eval/src/lib.rs b/crates/eval/src/lib.rs
index 06f85a0a68..7032d2f58a 100644
--- a/crates/eval/src/lib.rs
+++ b/crates/eval/src/lib.rs
@@ -3,8 +3,10 @@ use clap::{command, Parser};
 use reqwest::Client;
 use serde::Serialize;
 use serde_json::json;
-use slack_rust::chat::post_message::{post_message, PostMessageRequest};
-use slack_rust::http_client::default_client;
+use slack_rust::{
+    chat::post_message::{post_message, PostMessageRequest},
+    http_client::default_client,
+};
 use sp1_prover::{components::SP1ProverComponents, utils::get_cycles, SP1Prover};
 use sp1_sdk::{SP1Context, SP1Stdin};
 use sp1_stark::SP1ProverOpts;
@@ -19,7 +21,8 @@ mod program;
 #[derive(Parser, Clone)]
 #[command(about = "Evaluate the performance of SP1 on programs.")]
 struct EvalArgs {
-    /// The programs to evaluate, specified by name. If not specified, all programs will be evaluated.
+    /// The programs to evaluate, specified by name. If not specified, all programs will be
+    /// evaluated.
     #[arg(long, use_value_delimiter = true, value_delimiter = ',')]
     pub programs: Vec<String>,
 
@@ -169,14 +172,14 @@ fn run_evaluation<C: SP1ProverComponents>(
     let cycles = get_cycles(elf, stdin);
 
     let prover = SP1Prover::<C>::new();
-    let (pk, vk) = prover.setup(elf);
+    let (_, pk_d, program, vk) = prover.setup(elf);
 
     let context = SP1Context::default();
 
     let (_, exec_duration) = time_operation(|| prover.execute(elf, stdin, context.clone()));
 
     let (core_proof, core_duration) =
-        time_operation(|| prover.prove_core(&pk, stdin, opts, context).unwrap());
+        time_operation(|| prover.prove_core(&pk_d, program, stdin, opts, context).unwrap());
 
     let (_, compress_duration) =
         time_operation(|| prover.compress(&vk, core_proof, vec![], opts).unwrap());
diff --git a/crates/perf/run_s3.sh b/crates/perf/run_s3.sh
index a9a0ef7a06..901cde6574 100755
--- a/crates/perf/run_s3.sh
+++ b/crates/perf/run_s3.sh
@@ -1,13 +1,21 @@
 #!/bin/bash
 
-# Check if both arguments are provided
-if [ $# -ne 2 ]; then
-    echo "Usage: $0 <s3_path> <cpu|cuda>"
+# Set the default value for the stage argument
+stage="prove"
+
+# Check the number of arguments
+if [ $# -lt 2 ] || [ $# -gt 3 ]; then
+    echo "Usage: $0 <s3_path> <cpu|cuda> [execute|prove]"
     exit 1
 fi
 
+# If the third argument is provided, override the default value
+if [ $# -eq 3 ]; then
+    stage="$3"
+fi
+
 s3_path=$1
-stage=$2
+kind=$2
 
 # Download files from S3
 aws s3 cp s3://sp1-testing-suite/$s3_path/program.bin /tmp/program.bin
@@ -20,4 +28,4 @@ export RUST_LOG=debug
 export SP1_DEBUG=1
 
 # Run moongate-perf
-cargo run -p sp1-perf -- --program /tmp/program.bin --stdin /tmp/stdin.bin --mode $stage
\ No newline at end of file
+cargo run -p sp1-perf -- --program /tmp/program.bin --stdin /tmp/stdin.bin --mode $kind --stage $stage
\ No newline at end of file
diff --git a/crates/perf/src/main.rs b/crates/perf/src/main.rs
index 31a34c52ed..e493ce2c01 100644
--- a/crates/perf/src/main.rs
+++ b/crates/perf/src/main.rs
@@ -3,6 +3,7 @@ use std::{
     time::{Duration, Instant},
 };
 
+use clap::ValueEnum;
 use clap::{command, Parser};
 use sp1_cuda::SP1CudaProver;
 use sp1_prover::HashableKey;
@@ -20,6 +21,8 @@ struct PerfArgs {
     pub stdin: String,
     #[arg(short, long)]
     pub mode: ProverMode,
+    #[arg(short, long, default_value = "prove")]
+    pub stage: Stage,
 }
 
 #[derive(Default, Debug, Clone)]
@@ -37,6 +40,12 @@ struct PerfResult {
     pub verify_wrap_duration: Duration,
 }
 
+#[derive(Debug, Clone, ValueEnum, PartialEq, Eq)]
+enum Stage {
+    Execute,
+    Prove,
+}
+
 pub fn time_operation<T, F: FnOnce() -> T>(operation: F) -> (T, std::time::Duration) {
     let start = Instant::now();
     let result = operation();
@@ -53,8 +62,13 @@ fn main() {
     let stdin: SP1Stdin = bincode::deserialize(&stdin).expect("failed to deserialize stdin");
 
     let prover = SP1Prover::<DefaultProverComponents>::new();
-    let (pk, vk) = prover.setup(&elf);
+    let (pk, pk_d, program, vk) = prover.setup(&elf);
     let cycles = sp1_prover::utils::get_cycles(&elf, &stdin);
+    let stage = args.stage;
+    if stage == Stage::Execute {
+        println!("Program executed successfully, number of cycles: {}", cycles);
+        return;
+    }
     let opts = SP1ProverOpts::default();
 
     match args.mode {
@@ -63,8 +77,9 @@ fn main() {
             let (_, execution_duration) =
                 time_operation(|| prover.execute(&elf, &stdin, context.clone()));
 
-            let (core_proof, prove_core_duration) =
-                time_operation(|| prover.prove_core(&pk, &stdin, opts, context).unwrap());
+            let (core_proof, prove_core_duration) = time_operation(|| {
+                prover.prove_core(&pk_d, program, &stdin, opts, context).unwrap()
+            });
 
             let (_, verify_core_duration) =
                 time_operation(|| prover.verify(&core_proof.proof, &vk));
@@ -89,7 +104,8 @@ fn main() {
                 time_operation(|| prover.verify_wrap_bn254(&wrapped_bn254_proof, &vk));
 
             // Generate a proof that verifies two deferred proofs from the proof above.
-            let (pk_verify_proof, vk_verify_proof) = prover.setup(VERIFY_PROOF_ELF);
+            let (_, pk_verify_proof_d, pk_verify_program, vk_verify_proof) =
+                prover.setup(VERIFY_PROOF_ELF);
             let pv = core_proof.public_values.to_vec();
 
             let mut stdin = SP1Stdin::new();
@@ -101,7 +117,9 @@ fn main() {
 
             let context = SP1Context::default();
             let (core_proof, _) = time_operation(|| {
-                prover.prove_core(&pk_verify_proof, &stdin, opts, context).unwrap()
+                prover
+                    .prove_core(&pk_verify_proof_d, pk_verify_program, &stdin, opts, context)
+                    .unwrap()
             });
             let deferred_proofs =
                 stdin.proofs.into_iter().map(|(proof, _)| proof).collect::<Vec<_>>();
@@ -134,8 +152,10 @@ fn main() {
             let (_, execution_duration) =
                 time_operation(|| prover.execute(&elf, &stdin, context.clone()));
 
+            let (_, _) = time_operation(|| server.setup(&elf).unwrap());
+
             let (core_proof, prove_core_duration) =
-                time_operation(|| server.prove_core(&pk, &stdin).unwrap());
+                time_operation(|| server.prove_core(&stdin).unwrap());
 
             let (_, verify_core_duration) = time_operation(|| {
                 prover.verify(&core_proof.proof, &vk).expect("Proof verification failed")
diff --git a/crates/primitives/src/consts.rs b/crates/primitives/src/consts.rs
index 396905274b..0e497bd01e 100644
--- a/crates/primitives/src/consts.rs
+++ b/crates/primitives/src/consts.rs
@@ -1,9 +1,15 @@
 /// The maximum size of the memory in bytes.
 pub const MAXIMUM_MEMORY_SIZE: u32 = u32::MAX;
 
+/// The number of bits in a byte.
+pub const BYTE_SIZE: usize = 8;
+
 /// The size of a word in bytes.
 pub const WORD_SIZE: usize = 4;
 
+/// The number of bytes necessary to represent a 64-bit integer.
+pub const LONG_WORD_SIZE: usize = 2 * WORD_SIZE;
+
 /// Converts a slice of words to a byte vector in little endian.
 pub fn words_to_bytes_le_vec(words: &[u32]) -> Vec<u8> {
     words.iter().flat_map(|word| word.to_le_bytes().to_vec()).collect::<Vec<_>>()
diff --git a/crates/prover/Cargo.toml b/crates/prover/Cargo.toml
index 226b902c60..db60abac23 100644
--- a/crates/prover/Cargo.toml
+++ b/crates/prover/Cargo.toml
@@ -62,8 +62,8 @@ name = "post_trusted_setup"
 path = "scripts/post_trusted_setup.rs"
 
 [[bin]]
-name = "e2e"
-path = "scripts/e2e.rs"
+name = "find_minimal_large_recursion_shape"
+path = "scripts/find_minimal_large_recursion_shape.rs"
 
 [[bin]]
 name = "fibonacci_groth16"
diff --git a/crates/prover/scripts/e2e.rs b/crates/prover/scripts/e2e.rs
deleted file mode 100644
index 535e846a9e..0000000000
--- a/crates/prover/scripts/e2e.rs
+++ /dev/null
@@ -1,119 +0,0 @@
-// use std::{borrow::Borrow, path::PathBuf};
-
-// use clap::Parser;
-// use p3_baby_bear::BabyBear;
-// use p3_field::PrimeField;
-// use sp1_core_executor::SP1Context;
-// use sp1_core_machine::io::SP1Stdin;
-// use sp1_prover::{
-//     utils::{babybear_bytes_to_bn254, babybears_to_bn254, words_to_bytes},
-//     SP1Prover,
-// };
-// use sp1_recursion_circuit::{stark::build_wrap_circuit, witness::Witnessable};
-// use sp1_recursion_compiler::ir::Witness;
-// use sp1_recursion_core::air::RecursionPublicValues;
-// use sp1_recursion_gnark_ffi::{Groth16Bn254Prover, PlonkBn254Prover};
-// use sp1_stark::SP1ProverOpts;
-// use subtle_encoding::hex;
-
-// #[derive(Parser, Debug)]
-// #[clap(author, version, about, long_about = None)]
-// struct Args {
-//     #[clap(short, long)]
-//     build_dir: String,
-//     #[arg(short, long)]
-//     system: String,
-// }
-
-// pub fn main() {
-//     sp1_core_machine::utils::setup_logger();
-//     std::env::set_var("RECONSTRUCT_COMMITMENTS", "false");
-
-//     let args = Args::parse();
-//     let build_dir: PathBuf = args.build_dir.into();
-
-//     let elf = include_bytes!("../elf/riscv32im-succinct-zkvm-elf");
-
-//     tracing::info!("initializing prover");
-//     let prover: SP1Prover = SP1Prover::new();
-//     let opts = SP1ProverOpts::default();
-//     let context = SP1Context::default();
-
-//     tracing::info!("setup elf");
-//     let (pk, vk) = prover.setup(elf);
-
-//     tracing::info!("prove core");
-//     let stdin = SP1Stdin::new();
-//     let core_proof = prover.prove_core(&pk, &stdin, opts, context).unwrap();
-
-//     tracing::info!("Compress");
-//     let reduced_proof = prover.compress(&vk, core_proof, vec![], opts).unwrap();
-
-//     tracing::info!("Shrink");
-//     let compressed_proof = prover.shrink(reduced_proof, opts).unwrap();
-
-//     tracing::info!("wrap");
-//     let wrapped_proof = prover.wrap_bn254(compressed_proof, opts).unwrap();
-
-//     tracing::info!("building verifier constraints");
-//     let constraints = tracing::info_span!("wrap circuit")
-//         .in_scope(|| build_wrap_circuit(prover.wrap_vk(), wrapped_proof.proof.clone()));
-
-//     tracing::info!("building template witness");
-//     let pv: &RecursionPublicValues<_> = wrapped_proof.proof.public_values.as_slice().borrow();
-//     let vkey_hash = babybears_to_bn254(&pv.sp1_vk_digest);
-//     let committed_values_digest_bytes: [BabyBear; 32] =
-//         words_to_bytes(&pv.committed_value_digest).try_into().unwrap();
-//     let committed_values_digest = babybear_bytes_to_bn254(&committed_values_digest_bytes);
-
-//     let mut witness = Witness::default();
-//     wrapped_proof.proof.write(&mut witness);
-//     witness.write_committed_values_digest(committed_values_digest);
-//     witness.write_vkey_hash(vkey_hash);
-
-//     tracing::info!("sanity check plonk test");
-//     PlonkBn254Prover::test(constraints.clone(), witness.clone());
-
-//     tracing::info!("sanity check plonk build");
-//     PlonkBn254Prover::build(constraints.clone(), witness.clone(), build_dir.clone());
-
-//     tracing::info!("sanity check plonk prove");
-//     let plonk_bn254_prover = PlonkBn254Prover::new();
-
-//     tracing::info!("plonk prove");
-//     let proof = plonk_bn254_prover.prove(witness.clone(), build_dir.clone());
-
-//     tracing::info!("verify plonk proof");
-//     plonk_bn254_prover.verify(
-//         &proof,
-//         &vkey_hash.as_canonical_biguint(),
-//         &committed_values_digest.as_canonical_biguint(),
-//         &build_dir,
-//     );
-
-//     println!("plonk proof: {:?}", String::from_utf8(hex::encode(proof.encoded_proof)).unwrap());
-
-//     tracing::info!("sanity check groth16 test");
-//     Groth16Bn254Prover::test(constraints.clone(), witness.clone());
-
-//     tracing::info!("sanity check groth16 build");
-//     Groth16Bn254Prover::build(constraints.clone(), witness.clone(), build_dir.clone());
-
-//     tracing::info!("sanity check groth16 prove");
-//     let groth16_bn254_prover = Groth16Bn254Prover::new();
-
-//     tracing::info!("groth16 prove");
-//     let proof = groth16_bn254_prover.prove(witness.clone(), build_dir.clone());
-
-//     tracing::info!("verify groth16 proof");
-//     groth16_bn254_prover.verify(
-//         &proof,
-//         &vkey_hash.as_canonical_biguint(),
-//         &committed_values_digest.as_canonical_biguint(),
-//         &build_dir,
-//     );
-
-//     println!("groth16 proof: {:?}",
-// String::from_utf8(hex::encode(proof.encoded_proof)).unwrap()); }
-
-pub fn main() {}
diff --git a/crates/prover/scripts/fibonacci_groth16.rs b/crates/prover/scripts/fibonacci_groth16.rs
index 4ca51b00af..604e25c6a3 100644
--- a/crates/prover/scripts/fibonacci_groth16.rs
+++ b/crates/prover/scripts/fibonacci_groth16.rs
@@ -1,78 +1,80 @@
-//! Tests end-to-end performance of wrapping a recursion proof to PLONK.
+fn main() {}
 
-use std::time::Instant;
+// //! Tests end-to-end performance of wrapping a recursion proof to PLONK.
 
-use itertools::iproduct;
-use sp1_core_executor::SP1Context;
-use sp1_core_machine::io::SP1Stdin;
-use sp1_prover::components::DefaultProverComponents;
-use sp1_prover::SP1Prover;
-use sp1_stark::SP1ProverOpts;
-use tracing_subscriber::fmt::format::FmtSpan;
-use tracing_subscriber::util::SubscriberInitExt;
-use tracing_subscriber::EnvFilter;
+// use std::time::Instant;
 
-fn main() {
-    // Setup tracer.
-    let default_filter = "off";
-    let log_appender = tracing_appender::rolling::never("scripts/results", "fibonacci_groth16.log");
-    let env_filter = EnvFilter::try_from_default_env()
-        .unwrap_or_else(|_| EnvFilter::new(default_filter))
-        .add_directive("p3_keccak_air=off".parse().unwrap())
-        .add_directive("p3_fri=off".parse().unwrap())
-        .add_directive("p3_challenger=off".parse().unwrap())
-        .add_directive("p3_dft=off".parse().unwrap())
-        .add_directive("sp1_core=off".parse().unwrap());
-    tracing_subscriber::fmt::Subscriber::builder()
-        .with_ansi(false)
-        .with_file(false)
-        .with_target(false)
-        .with_thread_names(false)
-        .with_env_filter(env_filter)
-        .with_span_events(FmtSpan::CLOSE)
-        .with_writer(log_appender)
-        .finish()
-        .init();
+// use itertools::iproduct;
+// use sp1_core_executor::SP1Context;
+// use sp1_core_machine::io::SP1Stdin;
+// use sp1_prover::components::DefaultProverComponents;
+// use sp1_prover::SP1Prover;
+// use sp1_stark::SP1ProverOpts;
+// use tracing_subscriber::fmt::format::FmtSpan;
+// use tracing_subscriber::util::SubscriberInitExt;
+// use tracing_subscriber::EnvFilter;
 
-    // Setup environment variables.
-    std::env::set_var("RECONSTRUCT_COMMITMENTS", "false");
+// fn main() {
+//     // Setup tracer.
+//     let default_filter = "off";
+//     let log_appender = tracing_appender::rolling::never("scripts/results", "fibonacci_groth16.log");
+//     let env_filter = EnvFilter::try_from_default_env()
+//         .unwrap_or_else(|_| EnvFilter::new(default_filter))
+//         .add_directive("p3_keccak_air=off".parse().unwrap())
+//         .add_directive("p3_fri=off".parse().unwrap())
+//         .add_directive("p3_challenger=off".parse().unwrap())
+//         .add_directive("p3_dft=off".parse().unwrap())
+//         .add_directive("sp1_core=off".parse().unwrap());
+//     tracing_subscriber::fmt::Subscriber::builder()
+//         .with_ansi(false)
+//         .with_file(false)
+//         .with_target(false)
+//         .with_thread_names(false)
+//         .with_env_filter(env_filter)
+//         .with_span_events(FmtSpan::CLOSE)
+//         .with_writer(log_appender)
+//         .finish()
+//         .init();
 
-    // Initialize prover.
-    let prover = SP1Prover::<DefaultProverComponents>::new();
+//     // Setup environment variables.
+//     std::env::set_var("RECONSTRUCT_COMMITMENTS", "false");
 
-    // Setup sweep.
-    let iterations = [480000u32];
-    let shard_sizes = [1 << 22];
-    let batch_sizes = [2];
-    let elf = test_artifacts::FIBONACCI_ELF;
-    let (pk, vk) = prover.setup(elf);
+//     // Initialize prover.
+//     let prover = SP1Prover::<DefaultProverComponents>::new();
 
-    for (shard_size, iterations, batch_size) in iproduct!(shard_sizes, iterations, batch_sizes) {
-        tracing::info!(
-            "running: shard_size={}, iterations={}, batch_size={}",
-            shard_size,
-            iterations,
-            batch_size
-        );
-        std::env::set_var("SHARD_SIZE", shard_size.to_string());
+//     // Setup sweep.
+//     let iterations = [480000u32];
+//     let shard_sizes = [1 << 22];
+//     let batch_sizes = [2];
+//     let elf = test_artifacts::FIBONACCI_ELF;
+//     let (pk, vk, _, _) = prover.setup(elf);
 
-        tracing::info!("proving leaves");
-        let stdin = SP1Stdin {
-            buffer: vec![bincode::serialize::<u32>(&iterations).unwrap()],
-            ptr: 0,
-            proofs: vec![],
-        };
-        let leaf_proving_start = Instant::now();
-        let proof = prover
-            .prove_core(&pk, &stdin, SP1ProverOpts::default(), SP1Context::default())
-            .unwrap();
-        let leaf_proving_duration = leaf_proving_start.elapsed().as_secs_f64();
-        tracing::info!("leaf_proving_duration={}", leaf_proving_duration);
+//     for (shard_size, iterations, batch_size) in iproduct!(shard_sizes, iterations, batch_sizes) {
+//         tracing::info!(
+//             "running: shard_size={}, iterations={}, batch_size={}",
+//             shard_size,
+//             iterations,
+//             batch_size
+//         );
+//         std::env::set_var("SHARD_SIZE", shard_size.to_string());
 
-        tracing::info!("proving inner");
-        let recursion_proving_start = Instant::now();
-        let _ = prover.compress(&vk, proof, vec![], SP1ProverOpts::default());
-        let recursion_proving_duration = recursion_proving_start.elapsed().as_secs_f64();
-        tracing::info!("recursion_proving_duration={}", recursion_proving_duration);
-    }
-}
+//         tracing::info!("proving leaves");
+//         let stdin = SP1Stdin {
+//             buffer: vec![bincode::serialize::<u32>(&iterations).unwrap()],
+//             ptr: 0,
+//             proofs: vec![],
+//         };
+//         let leaf_proving_start = Instant::now();
+//         let proof = prover
+//             .prove_core(&pk, &stdin, SP1ProverOpts::default(), SP1Context::default())
+//             .unwrap();
+//         let leaf_proving_duration = leaf_proving_start.elapsed().as_secs_f64();
+//         tracing::info!("leaf_proving_duration={}", leaf_proving_duration);
+
+//         tracing::info!("proving inner");
+//         let recursion_proving_start = Instant::now();
+//         let _ = prover.compress(&vk, proof, vec![], SP1ProverOpts::default());
+//         let recursion_proving_duration = recursion_proving_start.elapsed().as_secs_f64();
+//         tracing::info!("recursion_proving_duration={}", recursion_proving_duration);
+//     }
+// }
diff --git a/crates/prover/scripts/fibonacci_sweep.rs b/crates/prover/scripts/fibonacci_sweep.rs
index 5321d4cf71..eedea9972b 100644
--- a/crates/prover/scripts/fibonacci_sweep.rs
+++ b/crates/prover/scripts/fibonacci_sweep.rs
@@ -1,89 +1,91 @@
-//! Sweeps end-to-end prover performance across a wide range of parameters for Fibonacci.
+fn main() {}
 
-use std::{fs::File, io::BufWriter, io::Write, time::Instant};
+// //! Sweeps end-to-end prover performance across a wide range of parameters for Fibonacci.
 
-use itertools::iproduct;
-use sp1_core_executor::SP1Context;
-use sp1_core_machine::io::SP1Stdin;
-use sp1_prover::components::DefaultProverComponents;
-use sp1_prover::SP1Prover;
-use sp1_stark::SP1ProverOpts;
-use tracing_subscriber::EnvFilter;
-use tracing_subscriber::{fmt::format::FmtSpan, util::SubscriberInitExt};
+// use std::{fs::File, io::BufWriter, io::Write, time::Instant};
 
-fn main() {
-    // Setup tracer.
-    let default_filter = "off";
-    let log_appender = tracing_appender::rolling::never("scripts/results", "fibonacci_sweep.log");
-    let env_filter = EnvFilter::try_from_default_env()
-        .unwrap_or_else(|_| EnvFilter::new(default_filter))
-        .add_directive("p3_keccak_air=off".parse().unwrap())
-        .add_directive("p3_fri=off".parse().unwrap())
-        .add_directive("p3_challenger=off".parse().unwrap())
-        .add_directive("p3_dft=off".parse().unwrap())
-        .add_directive("sp1_core=off".parse().unwrap());
-    tracing_subscriber::fmt::Subscriber::builder()
-        .with_ansi(false)
-        .with_file(false)
-        .with_target(false)
-        .with_thread_names(false)
-        .with_env_filter(env_filter)
-        .with_span_events(FmtSpan::CLOSE)
-        .with_writer(log_appender)
-        .finish()
-        .init();
+// use itertools::iproduct;
+// use sp1_core_executor::SP1Context;
+// use sp1_core_machine::io::SP1Stdin;
+// use sp1_prover::components::DefaultProverComponents;
+// use sp1_prover::SP1Prover;
+// use sp1_stark::SP1ProverOpts;
+// use tracing_subscriber::EnvFilter;
+// use tracing_subscriber::{fmt::format::FmtSpan, util::SubscriberInitExt};
 
-    // Setup environment variables.
-    std::env::set_var("RECONSTRUCT_COMMITMENTS", "false");
+// fn main() {
+//     // Setup tracer.
+//     let default_filter = "off";
+//     let log_appender = tracing_appender::rolling::never("scripts/results", "fibonacci_sweep.log");
+//     let env_filter = EnvFilter::try_from_default_env()
+//         .unwrap_or_else(|_| EnvFilter::new(default_filter))
+//         .add_directive("p3_keccak_air=off".parse().unwrap())
+//         .add_directive("p3_fri=off".parse().unwrap())
+//         .add_directive("p3_challenger=off".parse().unwrap())
+//         .add_directive("p3_dft=off".parse().unwrap())
+//         .add_directive("sp1_core=off".parse().unwrap());
+//     tracing_subscriber::fmt::Subscriber::builder()
+//         .with_ansi(false)
+//         .with_file(false)
+//         .with_target(false)
+//         .with_thread_names(false)
+//         .with_env_filter(env_filter)
+//         .with_span_events(FmtSpan::CLOSE)
+//         .with_writer(log_appender)
+//         .finish()
+//         .init();
 
-    // Initialize prover.
-    let prover = SP1Prover::<DefaultProverComponents>::new();
+//     // Setup environment variables.
+//     std::env::set_var("RECONSTRUCT_COMMITMENTS", "false");
 
-    // Setup sweep.
-    let iterations = [480000u32];
-    let shard_sizes = [1 << 19, 1 << 20, 1 << 21, 1 << 22];
-    let batch_sizes = [2, 3, 4];
-    let elf = test_artifacts::FIBONACCI_ELF;
-    let (pk, vk) = prover.setup(elf);
+//     // Initialize prover.
+//     let prover = SP1Prover::<DefaultProverComponents>::new();
 
-    let mut lines =
-        vec!["iterations,shard_size,batch_size,leaf_proving_duration,recursion_proving_duration"
-            .to_string()];
-    for (shard_size, iterations, batch_size) in iproduct!(shard_sizes, iterations, batch_sizes) {
-        tracing::info!(
-            "running: shard_size={}, iterations={}, batch_size={}",
-            shard_size,
-            iterations,
-            batch_size
-        );
-        std::env::set_var("SHARD_SIZE", shard_size.to_string());
+//     // Setup sweep.
+//     let iterations = [480000u32];
+//     let shard_sizes = [1 << 19, 1 << 20, 1 << 21, 1 << 22];
+//     let batch_sizes = [2, 3, 4];
+//     let elf = test_artifacts::FIBONACCI_ELF;
+//     let (pk, vk, program, _) = prover.setup(elf);
 
-        let stdin = SP1Stdin {
-            buffer: vec![bincode::serialize::<u32>(&iterations).unwrap()],
-            ptr: 0,
-            proofs: vec![],
-        };
-        let leaf_proving_start = Instant::now();
-        let proof = prover
-            .prove_core(&pk, &stdin, SP1ProverOpts::default(), SP1Context::default())
-            .unwrap();
-        let leaf_proving_duration = leaf_proving_start.elapsed().as_secs_f64();
+//     let mut lines =
+//         vec!["iterations,shard_size,batch_size,leaf_proving_duration,recursion_proving_duration"
+//             .to_string()];
+//     for (shard_size, iterations, batch_size) in iproduct!(shard_sizes, iterations, batch_sizes) {
+//         tracing::info!(
+//             "running: shard_size={}, iterations={}, batch_size={}",
+//             shard_size,
+//             iterations,
+//             batch_size
+//         );
+//         std::env::set_var("SHARD_SIZE", shard_size.to_string());
 
-        let recursion_proving_start = Instant::now();
-        let _ = prover.compress(&vk, proof, vec![], SP1ProverOpts::default());
-        let recursion_proving_duration = recursion_proving_start.elapsed().as_secs_f64();
+//         let stdin = SP1Stdin {
+//             buffer: vec![bincode::serialize::<u32>(&iterations).unwrap()],
+//             ptr: 0,
+//             proofs: vec![],
+//         };
+//         let leaf_proving_start = Instant::now();
+//         let proof = prover
+//             .prove_core(&pk, program, &stdin, SP1ProverOpts::default(), SP1Context::default())
+//             .unwrap();
+//         let leaf_proving_duration = leaf_proving_start.elapsed().as_secs_f64();
 
-        lines.push(format!(
-            "{},{},{},{},{}",
-            iterations, shard_size, batch_size, leaf_proving_duration, recursion_proving_duration
-        ));
-    }
+//         let recursion_proving_start = Instant::now();
+//         let _ = prover.compress(&vk, proof, vec![], SP1ProverOpts::default());
+//         let recursion_proving_duration = recursion_proving_start.elapsed().as_secs_f64();
 
-    let file = File::create("scripts/results/fibonacci_sweep.csv").unwrap();
-    let mut writer = BufWriter::new(file);
-    for line in lines.clone() {
-        writeln!(writer, "{}", line).unwrap();
-    }
+//         lines.push(format!(
+//             "{},{},{},{},{}",
+//             iterations, shard_size, batch_size, leaf_proving_duration, recursion_proving_duration
+//         ));
+//     }
 
-    println!("{:#?}", lines);
-}
+//     let file = File::create("scripts/results/fibonacci_sweep.csv").unwrap();
+//     let mut writer = BufWriter::new(file);
+//     for line in lines.clone() {
+//         writeln!(writer, "{}", line).unwrap();
+//     }
+
+//     println!("{:#?}", lines);
+// }
diff --git a/crates/prover/scripts/find_minimal_large_recursion_shape.rs b/crates/prover/scripts/find_minimal_large_recursion_shape.rs
new file mode 100644
index 0000000000..1c15f5e512
--- /dev/null
+++ b/crates/prover/scripts/find_minimal_large_recursion_shape.rs
@@ -0,0 +1,165 @@
+//! Searches for the minimal "large" recursion (compress) shape and the minimal shrink shape.
+//!
+//! Starting from the first shape returned by `union_config_with_extra_room`, each chip's
+//! log-height is lowered one step at a time until `check_shapes` (compress shape) or the shrink
+//! program setup (shrink shape) stops succeeding; the last value that still worked is kept.
+
+use std::panic::{catch_unwind, AssertUnwindSafe};
+
+use clap::Parser;
+use p3_baby_bear::BabyBear;
+use sp1_core_machine::utils::setup_logger;
+use sp1_prover::{
+    components::DefaultProverComponents,
+    shapes::{check_shapes, SP1ProofShape},
+    SP1Prover, ShrinkAir, REDUCE_BATCH_SIZE,
+};
+use sp1_recursion_core::shape::RecursionShapeConfig;
+use sp1_stark::{MachineProver, ProofShape};
+
+// Command-line options for the shape search. Plain `//` comments are used on purpose: `///` doc
+// comments on a clap-derived struct would change the generated help text.
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+struct Args {
+    // When set, vk verification is turned off on the prover (`vk_verification = !dummy`).
+    #[clap(short, long, default_value_t = false)]
+    dummy: bool,
+    // First argument forwarded to every `check_shapes` call.
+    #[clap(short, long, default_value_t = REDUCE_BATCH_SIZE)]
+    reduce_batch_size: usize,
+    // Third argument forwarded to every `check_shapes` call.
+    #[clap(short, long, default_value_t = 1)]
+    num_compiler_workers: usize,
+    // Long-only: a derived short flag would be `-n`, which `num_compiler_workers` already owns,
+    // and clap's debug assertions panic on duplicate short names. Currently not read by `main`.
+    #[clap(long, default_value_t = 1)]
+    num_setup_workers: usize,
+    // Currently not read by `main`.
+    #[clap(short, long)]
+    start: Option<usize>,
+    // Currently not read by `main`.
+    #[clap(short, long)]
+    end: Option<usize>,
+}
+
+/// Tunes the compress shape (with and without precompiles) and then the shrink shape, and prints
+/// the three resulting shapes. Panics, among other cases, if the prover has no recursion shape
+/// config, if the starting candidate fails `check_shapes`, or if the current shrink shape cannot
+/// be set up against the tuned compress shape.
+fn main() {
+    setup_logger();
+    let args = Args::parse();
+
+    let reduce_batch_size = args.reduce_batch_size;
+    let dummy = args.dummy;
+    let num_compiler_workers = args.num_compiler_workers;
+
+    let mut prover = SP1Prover::<DefaultProverComponents>::new();
+    prover.vk_verification = !dummy;
+
+    let recursion_shape_config =
+        prover.recursion_shape_config.as_ref().expect("recursion shape config not found");
+
+    // Create the maximal shape from all of the shapes in recursion_shape_config, then add 2 to
+    // all the log-heights of that shape. This is the starting candidate for the "minimal large
+    // shape".
+    let candidate = recursion_shape_config.union_config_with_extra_room().first().unwrap().clone();
+
+    prover.recursion_shape_config = Some(RecursionShapeConfig::from_hash_map(&candidate));
+
+    // Check that this candidate is big enough for all core shapes, including those with
+    // precompiles.
+    assert!(check_shapes(reduce_batch_size, false, num_compiler_workers, &prover,));
+
+    let mut answer = candidate.clone();
+
+    // Chip-by-chip in the candidate, reduce the log-height corresponding to that chip until the
+    // shape is no longer big enough to support all the core shapes. Then, record the log height for
+    // that chip into answer.
+    for (key, value) in candidate.iter() {
+        if key != "PublicValues" {
+            let mut done = false;
+            let mut new_val = *value;
+            while !done {
+                new_val -= 1;
+                answer.insert(key.clone(), new_val);
+                prover.recursion_shape_config = Some(RecursionShapeConfig::from_hash_map(&answer));
+                done = !check_shapes(reduce_batch_size, false, num_compiler_workers, &prover);
+            }
+            answer.insert(key.clone(), new_val + 1);
+        }
+    }
+
+    let mut no_precompile_answer = answer.clone();
+
+    // Repeat the process but only for core shapes that don't have a precompile in them.
+    for (key, value) in answer.iter() {
+        if key != "PublicValues" {
+            let mut done = false;
+            let mut new_val = *value;
+            while !done {
+                new_val -= 1;
+                no_precompile_answer.insert(key.clone(), new_val);
+                prover.recursion_shape_config =
+                    Some(RecursionShapeConfig::from_hash_map(&no_precompile_answer));
+                done = !check_shapes(reduce_batch_size, true, num_compiler_workers, &prover);
+            }
+            no_precompile_answer.insert(key.clone(), new_val + 1);
+        }
+    }
+
+    // Repeat this process to tune the shrink shape.
+    let mut shrink_shape = ShrinkAir::<BabyBear>::shrink_shape().clone_into_hash_map();
+
+    // First, check that the current shrink shape is compatible with the compress shape choice
+    // arising from the tuning process above.
+    assert!({
+        prover.recursion_shape_config = Some(RecursionShapeConfig::from_hash_map(&answer));
+        catch_unwind(AssertUnwindSafe(|| {
+            prover.shrink_prover.setup(&prover.program_from_shape(
+                true,
+                sp1_prover::shapes::SP1CompressProgramShape::from_proof_shape(
+                    SP1ProofShape::Shrink(ProofShape {
+                        chip_information: answer.clone().into_iter().collect::<Vec<_>>(),
+                    }),
+                    5,
+                ),
+                Some(shrink_shape.clone().into()),
+            ))
+        }))
+        .is_ok()
+    });
+
+    // Next, tune the shrink shape in the same manner as for the compress shapes.
+    for (key, value) in shrink_shape.clone().iter() {
+        if key != "PublicValues" {
+            let mut done = false;
+            // Start one above the current height so the first probe re-tests the current value.
+            let mut new_val = *value + 1;
+            while !done {
+                new_val -= 1;
+                shrink_shape.insert(key.clone(), new_val);
+                prover.recursion_shape_config = Some(RecursionShapeConfig::from_hash_map(&answer));
+                done = catch_unwind(AssertUnwindSafe(|| {
+                    prover.shrink_prover.setup(&prover.program_from_shape(
+                        true,
+                        sp1_prover::shapes::SP1CompressProgramShape::from_proof_shape(
+                            SP1ProofShape::Shrink(ProofShape {
+                                chip_information: answer.clone().into_iter().collect::<Vec<_>>(),
+                            }),
+                            5,
+                        ),
+                        Some(shrink_shape.clone().into()),
+                    ))
+                }))
+                .is_err();
+            }
+            shrink_shape.insert(key.clone(), new_val + 1);
+        }
+    }
+
+    println!("Final compress shape: {:?}", answer);
+    println!("Final compress shape with no precompiles: {:?}", no_precompile_answer);
+    println!("Final shrink shape: {:?}", shrink_shape);
+}
diff --git a/crates/prover/scripts/tendermint_sweep.rs b/crates/prover/scripts/tendermint_sweep.rs
index bb80f848ea..1ff056d2c5 100644
--- a/crates/prover/scripts/tendermint_sweep.rs
+++ b/crates/prover/scripts/tendermint_sweep.rs
@@ -1,89 +1,91 @@
-//! Sweeps end-to-end prover performance across a wide range of parameters for Tendermint.
+fn main() {}
 
-use std::{fs::File, io::BufWriter, io::Write, time::Instant};
+// //! Sweeps end-to-end prover performance across a wide range of parameters for Tendermint.
 
-use itertools::iproduct;
-use sp1_core_executor::SP1Context;
-use sp1_core_machine::io::SP1Stdin;
-use sp1_prover::components::DefaultProverComponents;
-use sp1_prover::SP1Prover;
-use sp1_stark::SP1ProverOpts;
-use tracing_subscriber::EnvFilter;
-use tracing_subscriber::{fmt::format::FmtSpan, util::SubscriberInitExt};
+// use std::{fs::File, io::BufWriter, io::Write, time::Instant};
 
-fn main() {
-    // Setup tracer.
-    let default_filter = "off";
-    let log_appender = tracing_appender::rolling::never("scripts/results", "tendermint_sweep.log");
-    let env_filter = EnvFilter::try_from_default_env()
-        .unwrap_or_else(|_| EnvFilter::new(default_filter))
-        .add_directive("p3_keccak_air=off".parse().unwrap())
-        .add_directive("p3_fri=off".parse().unwrap())
-        .add_directive("p3_challenger=off".parse().unwrap())
-        .add_directive("p3_dft=off".parse().unwrap())
-        .add_directive("sp1_core=off".parse().unwrap());
-    tracing_subscriber::fmt::Subscriber::builder()
-        .with_ansi(false)
-        .with_file(false)
-        .with_target(false)
-        .with_thread_names(false)
-        .with_env_filter(env_filter)
-        .with_span_events(FmtSpan::CLOSE)
-        .with_writer(log_appender)
-        .finish()
-        .init();
+// use itertools::iproduct;
+// use sp1_core_executor::SP1Context;
+// use sp1_core_machine::io::SP1Stdin;
+// use sp1_prover::components::DefaultProverComponents;
+// use sp1_prover::SP1Prover;
+// use sp1_stark::SP1ProverOpts;
+// use tracing_subscriber::EnvFilter;
+// use tracing_subscriber::{fmt::format::FmtSpan, util::SubscriberInitExt};
 
-    // Setup environment variables.
-    std::env::set_var("RECONSTRUCT_COMMITMENTS", "false");
+// fn main() {
+//     // Setup tracer.
+//     let default_filter = "off";
+//     let log_appender = tracing_appender::rolling::never("scripts/results", "tendermint_sweep.log");
+//     let env_filter = EnvFilter::try_from_default_env()
+//         .unwrap_or_else(|_| EnvFilter::new(default_filter))
+//         .add_directive("p3_keccak_air=off".parse().unwrap())
+//         .add_directive("p3_fri=off".parse().unwrap())
+//         .add_directive("p3_challenger=off".parse().unwrap())
+//         .add_directive("p3_dft=off".parse().unwrap())
+//         .add_directive("sp1_core=off".parse().unwrap());
+//     tracing_subscriber::fmt::Subscriber::builder()
+//         .with_ansi(false)
+//         .with_file(false)
+//         .with_target(false)
+//         .with_thread_names(false)
+//         .with_env_filter(env_filter)
+//         .with_span_events(FmtSpan::CLOSE)
+//         .with_writer(log_appender)
+//         .finish()
+//         .init();
 
-    // Initialize prover.
-    let prover = SP1Prover::<DefaultProverComponents>::new();
+//     // Setup environment variables.
+//     std::env::set_var("RECONSTRUCT_COMMITMENTS", "false");
 
-    // Setup sweep.
-    let iterations = [480000u32];
-    let shard_sizes = [1 << 19, 1 << 20, 1 << 21, 1 << 22];
-    let batch_sizes = [2];
-    let elf = test_artifacts::TENDERMINT_BENCHMARK_ELF;
-    let (pk, vk) = prover.setup(elf);
+//     // Initialize prover.
+//     let prover = SP1Prover::<DefaultProverComponents>::new();
 
-    let mut lines =
-        vec!["iterations,shard_size,batch_size,leaf_proving_duration,recursion_proving_duration"
-            .to_string()];
-    for (shard_size, iterations, batch_size) in iproduct!(shard_sizes, iterations, batch_sizes) {
-        tracing::info!(
-            "running: shard_size={}, iterations={}, batch_size={}",
-            shard_size,
-            iterations,
-            batch_size
-        );
-        std::env::set_var("SHARD_SIZE", shard_size.to_string());
+//     // Setup sweep.
+//     let iterations = [480000u32];
+//     let shard_sizes = [1 << 19, 1 << 20, 1 << 21, 1 << 22];
+//     let batch_sizes = [2];
+//     let elf = test_artifacts::TENDERMINT_BENCHMARK_ELF;
+//     let (pk, vk, _, _) = prover.setup(elf);
 
-        let stdin = SP1Stdin {
-            buffer: vec![bincode::serialize::<u32>(&iterations).unwrap()],
-            ptr: 0,
-            proofs: vec![],
-        };
-        let leaf_proving_start = Instant::now();
-        let proof = prover
-            .prove_core(&pk, &stdin, SP1ProverOpts::default(), SP1Context::default())
-            .unwrap();
-        let leaf_proving_duration = leaf_proving_start.elapsed().as_secs_f64();
+//     let mut lines =
+//         vec!["iterations,shard_size,batch_size,leaf_proving_duration,recursion_proving_duration"
+//             .to_string()];
+//     for (shard_size, iterations, batch_size) in iproduct!(shard_sizes, iterations, batch_sizes) {
+//         tracing::info!(
+//             "running: shard_size={}, iterations={}, batch_size={}",
+//             shard_size,
+//             iterations,
+//             batch_size
+//         );
+//         std::env::set_var("SHARD_SIZE", shard_size.to_string());
 
-        let recursion_proving_start = Instant::now();
-        let _ = prover.compress(&vk, proof, vec![], SP1ProverOpts::default());
-        let recursion_proving_duration = recursion_proving_start.elapsed().as_secs_f64();
+//         let stdin = SP1Stdin {
+//             buffer: vec![bincode::serialize::<u32>(&iterations).unwrap()],
+//             ptr: 0,
+//             proofs: vec![],
+//         };
+//         let leaf_proving_start = Instant::now();
+//         let proof = prover
+//             .prove_core(&pk, &stdin, SP1ProverOpts::default(), SP1Context::default())
+//             .unwrap();
+//         let leaf_proving_duration = leaf_proving_start.elapsed().as_secs_f64();
 
-        lines.push(format!(
-            "{},{},{},{},{}",
-            iterations, shard_size, batch_size, leaf_proving_duration, recursion_proving_duration
-        ));
-    }
+//         let recursion_proving_start = Instant::now();
+//         let _ = prover.compress(&vk, proof, vec![], SP1ProverOpts::default());
+//         let recursion_proving_duration = recursion_proving_start.elapsed().as_secs_f64();
 
-    let file = File::create("scripts/results/tendermint_sweep.csv").unwrap();
-    let mut writer = BufWriter::new(file);
-    for line in lines.clone() {
-        writeln!(writer, "{}", line).unwrap();
-    }
+//         lines.push(format!(
+//             "{},{},{},{},{}",
+//             iterations, shard_size, batch_size, leaf_proving_duration, recursion_proving_duration
+//         ));
+//     }
 
-    println!("{:#?}", lines);
-}
+//     let file = File::create("scripts/results/tendermint_sweep.csv").unwrap();
+//     let mut writer = BufWriter::new(file);
+//     for line in lines.clone() {
+//         writeln!(writer, "{}", line).unwrap();
+//     }
+
+//     println!("{:#?}", lines);
+// }
diff --git a/crates/prover/src/build.rs b/crates/prover/src/build.rs
index 7b5feb1118..9466f2bf26 100644
--- a/crates/prover/src/build.rs
+++ b/crates/prover/src/build.rs
@@ -157,12 +157,12 @@ pub fn dummy_proof() -> (StarkVerifyingKey<OuterSC>, ShardProof<OuterSC>) {
     let context = SP1Context::default();
 
     tracing::info!("setup elf");
-    let (pk, vk) = prover.setup(elf);
+    let (_, pk_d, program, vk) = prover.setup(elf);
 
     tracing::info!("prove core");
     let mut stdin = SP1Stdin::new();
     stdin.write(&500u32);
-    let core_proof = prover.prove_core(&pk, &stdin, opts, context).unwrap();
+    let core_proof = prover.prove_core(&pk_d, program, &stdin, opts, context).unwrap();
 
     tracing::info!("compress");
     let compressed_proof = prover.compress(&vk, core_proof, vec![], opts).unwrap();
diff --git a/crates/prover/src/lib.rs b/crates/prover/src/lib.rs
index 75915e5bc0..fb1dd412b1 100644
--- a/crates/prover/src/lib.rs
+++ b/crates/prover/src/lib.rs
@@ -26,17 +26,18 @@ use std::{
     path::Path,
     sync::{
         atomic::{AtomicUsize, Ordering},
-        mpsc::sync_channel,
+        mpsc::{channel, sync_channel},
         Arc, Mutex, OnceLock,
     },
     thread,
 };
 
+use crate::shapes::SP1CompressProgramShape;
 use lru::LruCache;
 use p3_baby_bear::BabyBear;
-use p3_challenger::CanObserve;
 use p3_field::{AbstractField, PrimeField, PrimeField32};
 use p3_matrix::dense::RowMajorMatrix;
+use shapes::SP1ProofShape;
 use sp1_core_executor::{ExecutionError, ExecutionReport, Executor, Program, SP1Context};
 use sp1_core_machine::{
     io::SP1Stdin,
@@ -64,18 +65,20 @@ use sp1_recursion_compiler::{
     ir::{Builder, Witness},
 };
 use sp1_recursion_core::{
-    air::RecursionPublicValues, machine::RecursionAir, runtime::ExecutionRecord,
-    shape::RecursionShapeConfig, stark::BabyBearPoseidon2Outer, RecursionProgram,
-    Runtime as RecursionRuntime,
+    air::RecursionPublicValues,
+    machine::RecursionAir,
+    runtime::ExecutionRecord,
+    shape::{RecursionShape, RecursionShapeConfig},
+    stark::BabyBearPoseidon2Outer,
+    RecursionProgram, Runtime as RecursionRuntime,
 };
 pub use sp1_recursion_gnark_ffi::proof::{Groth16Bn254Proof, PlonkBn254Proof};
 use sp1_recursion_gnark_ffi::{groth16_bn254::Groth16Bn254Prover, plonk_bn254::PlonkBn254Prover};
-use sp1_stark::{air::InteractionScope, MachineProvingKey, ProofShape};
 use sp1_stark::{
-    air::PublicValues, baby_bear_poseidon2::BabyBearPoseidon2, Challenge, Challenger,
-    MachineProver, SP1CoreOpts, SP1ProverOpts, ShardProof, StarkGenericConfig, StarkVerifyingKey,
-    Val, Word, DIGEST_SIZE,
+    baby_bear_poseidon2::BabyBearPoseidon2, Challenge, MachineProver, SP1CoreOpts, SP1ProverOpts,
+    ShardProof, StarkGenericConfig, StarkVerifyingKey, Val, Word, DIGEST_SIZE,
 };
+use sp1_stark::{MachineProvingKey, ProofShape};
 use tracing::instrument;
 
 pub use types::*;
@@ -94,26 +97,34 @@ pub type InnerSC = BabyBearPoseidon2;
 /// The configuration for the outer prover.
 pub type OuterSC = BabyBearPoseidon2Outer;
 
+pub type DeviceProvingKey<C> = <<C as SP1ProverComponents>::CoreProver as MachineProver<
+    BabyBearPoseidon2,
+    RiscvAir<BabyBear>,
+>>::DeviceProvingKey;
+
 const COMPRESS_DEGREE: usize = 3;
 const SHRINK_DEGREE: usize = 3;
 const WRAP_DEGREE: usize = 9;
 
 const CORE_CACHE_SIZE: usize = 5;
-const COMPRESS_CACHE_SIZE: usize = 3;
 pub const REDUCE_BATCH_SIZE: usize = 2;
 
-// TODO: FIX
-//
-// const SHAPES_URL_PREFIX: &str = "https://sp1-circuits.s3.us-east-2.amazonaws.com/shapes";
-// const SHAPES_VERSION: &str = "146079e0e";
-// lazy_static! {
-//     static ref SHAPES_INIT: Once = Once::new();
-// }
-
 pub type CompressAir<F> = RecursionAir<F, COMPRESS_DEGREE>;
 pub type ShrinkAir<F> = RecursionAir<F, SHRINK_DEGREE>;
 pub type WrapAir<F> = RecursionAir<F, WRAP_DEGREE>;
 
+#[allow(clippy::type_complexity)]
+enum TracesOrInput {
+    ProgramRecordTraces(
+        Box<(
+            Arc<RecursionProgram<BabyBear>>,
+            ExecutionRecord<BabyBear>,
+            Vec<(String, RowMajorMatrix<BabyBear>)>,
+        )>,
+    ),
+    CircuitWitness(Box<SP1CircuitWitness>),
+}
+
 /// A end-to-end prover implementation for the SP1 RISC-V zkVM.
 pub struct SP1Prover<C: SP1ProverComponents = DefaultProverComponents> {
     /// The machine used for proving the core step.
@@ -132,8 +143,7 @@ pub struct SP1Prover<C: SP1ProverComponents = DefaultProverComponents> {
 
     pub recursion_cache_misses: AtomicUsize,
 
-    pub compress_programs:
-        Mutex<LruCache<SP1CompressWithVkeyShape, Arc<RecursionProgram<BabyBear>>>>,
+    pub compress_programs: BTreeMap<SP1CompressWithVkeyShape, Arc<RecursionProgram<BabyBear>>>,
 
     pub compress_cache_misses: AtomicUsize,
 
@@ -185,14 +195,6 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
         )
         .expect("PROVER_CORE_CACHE_SIZE must be a non-zero usize");
 
-        let compress_cache_size = NonZeroUsize::new(
-            env::var("PROVER_COMPRESS_CACHE_SIZE")
-                .unwrap_or_else(|_| CORE_CACHE_SIZE.to_string())
-                .parse()
-                .unwrap_or(COMPRESS_CACHE_SIZE),
-        )
-        .expect("PROVER_COMPRESS_CACHE_SIZE must be a non-zero usize");
-
         let core_shape_config = env::var("FIX_CORE_SHAPES")
             .map(|v| v.eq_ignore_ascii_case("true"))
             .unwrap_or(true)
@@ -205,8 +207,7 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
 
         let vk_verification =
             env::var("VERIFY_VK").map(|v| v.eq_ignore_ascii_case("true")).unwrap_or(false);
-
-        tracing::debug!("vk verification: {}", vk_verification);
+        tracing::info!("vk verification: {}", vk_verification);
 
         // Read the shapes from the shapes directory and deserialize them into memory.
         let allowed_vk_map: BTreeMap<[BabyBear; DIGEST_SIZE], usize> = if vk_verification {
@@ -217,6 +218,28 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
 
         let (root, merkle_tree) = MerkleTree::commit(allowed_vk_map.keys().copied().collect());
 
+        let mut compress_programs = BTreeMap::new();
+        if let Some(config) = &recursion_shape_config {
+            SP1ProofShape::generate_compress_shapes(config, REDUCE_BATCH_SIZE).for_each(|shape| {
+                let compress_shape = SP1CompressWithVkeyShape {
+                    compress_shape: shape.into(),
+                    merkle_tree_height: merkle_tree.height,
+                };
+                let input = SP1CompressWithVKeyWitnessValues::dummy(
+                    compress_prover.machine(),
+                    &compress_shape,
+                );
+                let program = compress_program_from_input::<C>(
+                    recursion_shape_config.as_ref(),
+                    &compress_prover,
+                    vk_verification,
+                    &input,
+                );
+                let program = Arc::new(program);
+                compress_programs.insert(compress_shape, program);
+            });
+        }
+
         Self {
             core_prover,
             compress_prover,
@@ -224,7 +247,7 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
             wrap_prover,
             recursion_programs: Mutex::new(LruCache::new(core_cache_size)),
             recursion_cache_misses: AtomicUsize::new(0),
-            compress_programs: Mutex::new(LruCache::new(compress_cache_size)),
+            compress_programs,
             compress_cache_misses: AtomicUsize::new(0),
             vk_root: root,
             vk_merkle_tree: merkle_tree,
@@ -237,13 +260,12 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
         }
     }
 
-    /// Fully initializes the programs, proving keys, and verifying keys that are normally
-    /// lazily initialized. TODO: remove this.
-    pub fn initialize(&mut self) {}
-
     /// Creates a proving key and a verifying key for a given RISC-V ELF.
     #[instrument(name = "setup", level = "debug", skip_all)]
-    pub fn setup(&self, elf: &[u8]) -> (SP1ProvingKey, SP1VerifyingKey) {
+    pub fn setup(
+        &self,
+        elf: &[u8],
+    ) -> (SP1ProvingKey, DeviceProvingKey<C>, Program, SP1VerifyingKey) {
         let program = self.get_program(elf).unwrap();
         let (pk, vk) = self.core_prover.setup(&program);
         let vk = SP1VerifyingKey { vk };
@@ -252,7 +274,8 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
             elf: elf.to_vec(),
             vk: vk.clone(),
         };
-        (pk, vk)
+        let pk_d = self.core_prover.pk_to_device(&pk.pk);
+        (pk, pk_d, program, vk)
     }
 
     /// Get a program with an allowed preprocessed shape.
@@ -289,333 +312,71 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
     #[instrument(name = "prove_core", level = "info", skip_all)]
     pub fn prove_core<'a>(
         &'a self,
-        pk: &SP1ProvingKey,
+        pk_d: &<<C as SP1ProverComponents>::CoreProver as MachineProver<
+            BabyBearPoseidon2,
+            RiscvAir<BabyBear>,
+        >>::DeviceProvingKey,
+        program: Program,
         stdin: &SP1Stdin,
         opts: SP1ProverOpts,
         mut context: SP1Context<'a>,
     ) -> Result<SP1CoreProof, SP1CoreProverError> {
         context.subproof_verifier.replace(Arc::new(self));
-        let program = self.get_program(&pk.elf).unwrap();
-        let pk = self.core_prover.pk_to_device(&pk.pk);
-        let (proof, public_values_stream, cycles) =
-            sp1_core_machine::utils::prove_with_context::<_, C::CoreProver>(
-                &self.core_prover,
-                &pk,
-                program,
-                stdin,
-                opts.core_opts,
-                context,
-                self.core_shape_config.as_ref(),
-            )?;
-        Self::check_for_high_cycles(cycles);
-        let public_values = SP1PublicValues::from(&public_values_stream);
-        Ok(SP1CoreProof {
-            proof: SP1CoreProofData(proof.shard_proofs),
-            stdin: stdin.clone(),
-            public_values,
-            cycles,
-        })
-    }
-
-    pub fn recursion_program(
-        &self,
-        input: &SP1RecursionWitnessValues<CoreSC>,
-    ) -> Arc<RecursionProgram<BabyBear>> {
-        let mut cache = self.recursion_programs.lock().unwrap_or_else(|e| e.into_inner());
-        cache
-            .get_or_insert(input.shape(), || {
-                let misses = self.recursion_cache_misses.fetch_add(1, Ordering::Relaxed);
-                tracing::debug!("core cache miss, misses: {}", misses);
-                // Get the operations.
-                let builder_span = tracing::debug_span!("build recursion program").entered();
-                let mut builder = Builder::<InnerConfig>::default();
-
-                let input = input.read(&mut builder);
-                SP1RecursiveVerifier::verify(&mut builder, self.core_prover.machine(), input);
-                let operations = builder.into_operations();
-                builder_span.exit();
-
-                // Compile the program.
-                let compiler_span = tracing::debug_span!("compile recursion program").entered();
-                let mut compiler = AsmCompiler::<InnerConfig>::default();
-                let mut program = compiler.compile(operations);
-                if let Some(recursion_shape_config) = &self.recursion_shape_config {
-                    recursion_shape_config.fix_shape(&mut program);
-                }
-                let program = Arc::new(program);
-                compiler_span.exit();
-                program
-            })
-            .clone()
-    }
-
-    pub fn compress_program(
-        &self,
-        input: &SP1CompressWithVKeyWitnessValues<InnerSC>,
-    ) -> Arc<RecursionProgram<BabyBear>> {
-        let mut cache = self.compress_programs.lock().unwrap_or_else(|e| e.into_inner());
-        let shape = input.shape();
-        cache
-            .get_or_insert(shape.clone(), || {
-                let misses = self.compress_cache_misses.fetch_add(1, Ordering::Relaxed);
-                tracing::debug!("compress cache miss, misses: {}", misses);
-                // Get the operations.
-                let builder_span = tracing::debug_span!("build compress program").entered();
-                let mut builder = Builder::<InnerConfig>::default();
-
-                // read the input.
-                let input = input.read(&mut builder);
-                // Verify the proof.
-                SP1CompressWithVKeyVerifier::verify(
-                    &mut builder,
-                    self.compress_prover.machine(),
-                    input,
-                    self.vk_verification,
-                    PublicValuesOutputDigest::Reduce,
-                );
-                let operations = builder.into_operations();
-                builder_span.exit();
-
-                // Compile the program.
-                let compiler_span = tracing::debug_span!("compile compress program").entered();
-                let mut compiler = AsmCompiler::<InnerConfig>::default();
-                let mut program = compiler.compile(operations);
-                if let Some(recursion_shape_config) = &self.recursion_shape_config {
-                    recursion_shape_config.fix_shape(&mut program);
-                }
-                let program = Arc::new(program);
-                compiler_span.exit();
-                program
-            })
-            .clone()
-    }
 
-    pub fn shrink_program(
-        &self,
-        input: &SP1CompressWithVKeyWitnessValues<InnerSC>,
-    ) -> Arc<RecursionProgram<BabyBear>> {
-        // Get the operations.
-        let builder_span = tracing::debug_span!("build shrink program").entered();
-        let mut builder = Builder::<InnerConfig>::default();
-        let input = input.read(&mut builder);
-        // Verify the proof.
-        SP1CompressRootVerifierWithVKey::verify(
-            &mut builder,
-            self.compress_prover.machine(),
-            input,
-            self.vk_verification,
-            PublicValuesOutputDigest::Reduce,
-        );
-        let operations = builder.into_operations();
-        builder_span.exit();
-
-        // Compile the program.
-        let compiler_span = tracing::debug_span!("compile shrink program").entered();
-        let mut compiler = AsmCompiler::<InnerConfig>::default();
-        let mut program = compiler.compile(operations);
-        program.shape = Some(ShrinkAir::<BabyBear>::shrink_shape());
-        let program = Arc::new(program);
-        compiler_span.exit();
-        program
-    }
-
-    pub fn wrap_program(&self) -> Arc<RecursionProgram<BabyBear>> {
-        self.wrap_program
-            .get_or_init(|| {
-                // Get the operations.
-                let builder_span = tracing::debug_span!("build compress program").entered();
-                let mut builder = Builder::<WrapConfig>::default();
-
-                let shrink_shape: ProofShape = ShrinkAir::<BabyBear>::shrink_shape().into();
-                let input_shape = SP1CompressShape::from(vec![shrink_shape]);
-                let shape = SP1CompressWithVkeyShape {
-                    compress_shape: input_shape,
-                    merkle_tree_height: self.vk_merkle_tree.height,
-                };
-                let dummy_input =
-                    SP1CompressWithVKeyWitnessValues::dummy(self.shrink_prover.machine(), &shape);
+        // Prove the core on a spawned thread while the current thread compiles the first few
+        // recursion programs.
+        let span = tracing::Span::current().clone();
+        std::thread::scope(|s| {
+            let _span = span.enter();
+            let (proof_tx, proof_rx) = channel();
+            let (shape_tx, shape_rx) = channel();
+
+            let span = tracing::Span::current().clone();
+            let handle = s.spawn(move || {
+                let _span = span.enter();
+
+                // The proving key is already on the device.
+                let pk = pk_d;
+
+                // Prove the core and stream the proofs and shapes.
+                sp1_core_machine::utils::prove_core_stream::<_, C::CoreProver>(
+                    &self.core_prover,
+                    pk,
+                    program,
+                    stdin,
+                    opts.core_opts,
+                    context,
+                    self.core_shape_config.as_ref(),
+                    proof_tx,
+                    shape_tx,
+                )
+            });
 
-                let input = dummy_input.read(&mut builder);
+            // Receive the first few shapes and compile the recursion programs.
+            for _ in 0..3 {
+                if let Ok((shape, is_complete)) = shape_rx.recv() {
+                    let compress_shape = SP1CompressProgramShape::Recursion(SP1RecursionShape {
+                        proof_shapes: vec![shape],
+                        is_complete,
+                    });
 
-                // Attest that the merkle tree root is correct.
-                let root = input.merkle_var.root;
-                for (val, expected) in root.iter().zip(self.vk_root.iter()) {
-                    builder.assert_felt_eq(*val, *expected);
+                    // Insert the program into the cache.
+                    self.program_from_shape(false, compress_shape, None);
                 }
-                // Verify the proof.
-                SP1CompressRootVerifierWithVKey::verify(
-                    &mut builder,
-                    self.shrink_prover.machine(),
-                    input,
-                    self.vk_verification,
-                    PublicValuesOutputDigest::Root,
-                );
-
-                let operations = builder.into_operations();
-                builder_span.exit();
-
-                // Compile the program.
-                let compiler_span = tracing::debug_span!("compile compress program").entered();
-                let mut compiler = AsmCompiler::<WrapConfig>::default();
-                let program = Arc::new(compiler.compile(operations));
-                compiler_span.exit();
-                program
-            })
-            .clone()
-    }
-
-    pub fn deferred_program(
-        &self,
-        input: &SP1DeferredWitnessValues<InnerSC>,
-    ) -> Arc<RecursionProgram<BabyBear>> {
-        // Compile the program.
-
-        // Get the operations.
-        let operations_span =
-            tracing::debug_span!("get operations for the deferred program").entered();
-        let mut builder = Builder::<InnerConfig>::default();
-        let input_read_span = tracing::debug_span!("Read input values").entered();
-        let input = input.read(&mut builder);
-        input_read_span.exit();
-        let verify_span = tracing::debug_span!("Verify deferred program").entered();
-
-        // Verify the proof.
-        SP1DeferredVerifier::verify(
-            &mut builder,
-            self.compress_prover.machine(),
-            input,
-            self.vk_verification,
-        );
-        verify_span.exit();
-        let operations = builder.into_operations();
-        operations_span.exit();
-
-        let compiler_span = tracing::debug_span!("compile deferred program").entered();
-        let mut compiler = AsmCompiler::<InnerConfig>::default();
-        let mut program = compiler.compile(operations);
-        if let Some(recursion_shape_config) = &self.recursion_shape_config {
-            recursion_shape_config.fix_shape(&mut program);
-        }
-        let program = Arc::new(program);
-        compiler_span.exit();
-        program
-    }
-
-    pub fn get_recursion_core_inputs(
-        &self,
-        vk: &StarkVerifyingKey<CoreSC>,
-        leaf_challenger: &Challenger<CoreSC>,
-        shard_proofs: &[ShardProof<CoreSC>],
-        batch_size: usize,
-        is_complete: bool,
-    ) -> Vec<SP1RecursionWitnessValues<CoreSC>> {
-        let mut core_inputs = Vec::new();
-        let mut reconstruct_challenger = self.core_prover.config().challenger();
-        vk.observe_into(&mut reconstruct_challenger);
-
-        // Prepare the inputs for the recursion programs.
-        for (batch_idx, batch) in shard_proofs.chunks(batch_size).enumerate() {
-            let proofs = batch.to_vec();
-
-            core_inputs.push(SP1RecursionWitnessValues {
-                vk: vk.clone(),
-                shard_proofs: proofs.clone(),
-                leaf_challenger: leaf_challenger.clone(),
-                initial_reconstruct_challenger: reconstruct_challenger.clone(),
-                is_complete,
-                is_first_shard: batch_idx == 0,
-                vk_root: self.vk_root,
-            });
-            assert_eq!(reconstruct_challenger.input_buffer.len(), 0);
-            assert_eq!(reconstruct_challenger.sponge_state.len(), 16);
-            assert_eq!(reconstruct_challenger.output_buffer.len(), 16);
-
-            for proof in batch.iter() {
-                reconstruct_challenger.observe(proof.commitment.global_main_commit);
-                reconstruct_challenger
-                    .observe_slice(&proof.public_values[0..self.core_prover.num_pv_elts()]);
             }
-        }
-
-        // Check that the leaf challenger is the same as the reconstruct challenger.
-        assert_eq!(reconstruct_challenger.sponge_state, leaf_challenger.sponge_state);
-        assert_eq!(reconstruct_challenger.input_buffer, leaf_challenger.input_buffer);
-        assert_eq!(reconstruct_challenger.output_buffer, leaf_challenger.output_buffer);
-        core_inputs
-    }
-
-    pub fn get_recursion_deferred_inputs<'a>(
-        &'a self,
-        vk: &'a StarkVerifyingKey<CoreSC>,
-        leaf_challenger: &'a Challenger<InnerSC>,
-        last_proof_pv: &PublicValues<Word<BabyBear>, BabyBear>,
-        deferred_proofs: &[SP1ReduceProof<InnerSC>],
-        batch_size: usize,
-    ) -> Vec<SP1DeferredWitnessValues<InnerSC>> {
-        // Prepare the inputs for the deferred proofs recursive verification.
-        let mut deferred_digest = [Val::<InnerSC>::zero(); DIGEST_SIZE];
-        let mut deferred_inputs = Vec::new();
 
-        for batch in deferred_proofs.chunks(batch_size) {
-            let vks_and_proofs =
-                batch.iter().cloned().map(|proof| (proof.vk, proof.proof)).collect::<Vec<_>>();
-
-            let input = SP1CompressWitnessValues { vks_and_proofs, is_complete: true };
-            let input = self.make_merkle_proofs(input);
-            let SP1CompressWithVKeyWitnessValues { compress_val, merkle_val } = input;
-
-            deferred_inputs.push(SP1DeferredWitnessValues {
-                vks_and_proofs: compress_val.vks_and_proofs,
-                vk_merkle_data: merkle_val,
-                start_reconstruct_deferred_digest: deferred_digest,
-                is_complete: false,
-                sp1_vk_digest: vk.hash_babybear(),
-                end_pc: Val::<InnerSC>::zero(),
-                end_shard: last_proof_pv.shard + BabyBear::one(),
-                end_execution_shard: last_proof_pv.execution_shard,
-                init_addr_bits: last_proof_pv.last_init_addr_bits,
-                finalize_addr_bits: last_proof_pv.last_finalize_addr_bits,
-                leaf_challenger: leaf_challenger.clone(),
-                committed_value_digest: last_proof_pv.committed_value_digest,
-                deferred_proofs_digest: last_proof_pv.deferred_proofs_digest,
-            });
-
-            deferred_digest = Self::hash_deferred_proofs(deferred_digest, batch);
-        }
-        deferred_inputs
-    }
-
-    /// Generate the inputs for the first layer of recursive proofs.
-    #[allow(clippy::type_complexity)]
-    pub fn get_first_layer_inputs<'a>(
-        &'a self,
-        vk: &'a SP1VerifyingKey,
-        leaf_challenger: &'a Challenger<InnerSC>,
-        shard_proofs: &[ShardProof<InnerSC>],
-        deferred_proofs: &[SP1ReduceProof<InnerSC>],
-        batch_size: usize,
-    ) -> Vec<SP1CircuitWitness> {
-        let is_complete = shard_proofs.len() == 1 && deferred_proofs.is_empty();
-        let core_inputs = self.get_recursion_core_inputs(
-            &vk.vk,
-            leaf_challenger,
-            shard_proofs,
-            batch_size,
-            is_complete,
-        );
-        let last_proof_pv = shard_proofs.last().unwrap().public_values.as_slice().borrow();
-        let deferred_inputs = self.get_recursion_deferred_inputs(
-            &vk.vk,
-            leaf_challenger,
-            last_proof_pv,
-            deferred_proofs,
-            batch_size,
-        );
-
-        let mut inputs = Vec::new();
-        inputs.extend(core_inputs.into_iter().map(SP1CircuitWitness::Core));
-        inputs.extend(deferred_inputs.into_iter().map(SP1CircuitWitness::Deferred));
-        inputs
+            // Collect the shard proofs and the public values stream.
+            let shard_proofs: Vec<ShardProof<_>> = proof_rx.iter().collect();
+            let (public_values_stream, cycles) = handle.join().unwrap().unwrap();
+            let public_values = SP1PublicValues::from(&public_values_stream);
+            Self::check_for_high_cycles(cycles);
+            Ok(SP1CoreProof {
+                proof: SP1CoreProofData(shard_proofs),
+                stdin: stdin.clone(),
+                public_values,
+                cycles,
+            })
+        })
     }
 
     /// Reduce shards proofs to a single shard proof using the recursion prover.
@@ -634,22 +395,9 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
 
         let shard_proofs = &proof.proof.0;
 
-        // Get the leaf challenger.
-        let mut leaf_challenger = self.core_prover.config().challenger();
-        vk.vk.observe_into(&mut leaf_challenger);
-        shard_proofs.iter().for_each(|proof| {
-            leaf_challenger.observe(proof.commitment.global_main_commit);
-            leaf_challenger.observe_slice(&proof.public_values[0..self.core_prover.num_pv_elts()]);
-        });
-
         // Generate the first layer inputs.
-        let first_layer_inputs = self.get_first_layer_inputs(
-            vk,
-            &leaf_challenger,
-            shard_proofs,
-            &deferred_proofs,
-            first_layer_batch_size,
-        );
+        let first_layer_inputs =
+            self.get_first_layer_inputs(vk, shard_proofs, &deferred_proofs, first_layer_batch_size);
 
         // Calculate the expected height of the tree.
         let mut expected_height = if first_layer_inputs.len() == 1 { 0 } else { 1 };
@@ -667,7 +415,7 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
 
             // Spawn a worker that sends the first layer inputs to a bounded channel.
             let input_sync = Arc::new(TurnBasedSync::new());
-            let (input_tx, input_rx) = sync_channel::<(usize, usize, SP1CircuitWitness)>(
+            let (input_tx, input_rx) = sync_channel::<(usize, usize, SP1CircuitWitness, bool)>(
                 opts.recursion_opts.checkpoints_channel_capacity,
             );
             let input_tx = Arc::new(Mutex::new(input_tx));
@@ -677,7 +425,7 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
                 s.spawn(move || {
                     for (index, input) in first_layer_inputs.into_iter().enumerate() {
                         input_sync.wait_for_turn(index);
-                        input_tx.lock().unwrap().send((index, 0, input)).unwrap();
+                        input_tx.lock().unwrap().send((index, 0, input, false)).unwrap();
                         input_sync.advance_turn();
                     }
                 });
@@ -686,13 +434,9 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
             // Spawn workers who generate the records and traces.
             let record_and_trace_sync = Arc::new(TurnBasedSync::new());
             let (record_and_trace_tx, record_and_trace_rx) =
-                sync_channel::<(
-                    usize,
-                    usize,
-                    Arc<RecursionProgram<BabyBear>>,
-                    ExecutionRecord<BabyBear>,
-                    Vec<(String, RowMajorMatrix<BabyBear>)>,
-                )>(opts.recursion_opts.records_and_traces_channel_capacity);
+                sync_channel::<(usize, usize, TracesOrInput)>(
+                    opts.recursion_opts.records_and_traces_channel_capacity,
+                );
             let record_and_trace_tx = Arc::new(Mutex::new(record_and_trace_tx));
             let record_and_trace_rx = Arc::new(Mutex::new(record_and_trace_rx));
             let input_rx = Arc::new(Mutex::new(input_rx));
@@ -705,7 +449,7 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
                     let _span = span.enter();
                     loop {
                         let received = { input_rx.lock().unwrap().recv() };
-                        if let Ok((index, height, input)) = received {
+                        if let Ok((index, height, input, false)) = received {
                             // Get the program and witness stream.
                             let (program, witness_stream) = tracing::debug_span!(
                                 "get program and witness stream"
@@ -731,7 +475,10 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
                                         &mut witness_stream,
                                     );
 
-                                    (self.compress_program(&input_with_merkle), witness_stream)
+                                    (
+                                        self.compress_program(false, &input_with_merkle),
+                                        witness_stream,
+                                    )
                                 }
                             });
 
@@ -764,19 +511,39 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
 
                             // Generate the traces.
                             let record = records.into_iter().next().unwrap();
-                            let traces = tracing::debug_span!("generate traces").in_scope(|| {
-                                self.compress_prover
-                                    .generate_traces(&record, InteractionScope::Local)
-                            });
+                            let traces = tracing::debug_span!("generate traces")
+                                .in_scope(|| self.compress_prover.generate_traces(&record));
+
+                            // Wait for our turn to update the state.
+                            record_and_trace_sync.wait_for_turn(index);
+
+                            // Send the record and traces to the worker.
+                            record_and_trace_tx
+                                .lock()
+                                .unwrap()
+                                .send((
+                                    index,
+                                    height,
+                                    TracesOrInput::ProgramRecordTraces(Box::new((
+                                        program, record, traces,
+                                    ))),
+                                ))
+                                .unwrap();
 
-                            // Wait for our turn to update the state.
+                            // Advance the turn.
+                            record_and_trace_sync.advance_turn();
+                        } else if let Ok((index, height, input, true)) = received {
                             record_and_trace_sync.wait_for_turn(index);
 
                             // Send the record and traces to the worker.
                             record_and_trace_tx
                                 .lock()
                                 .unwrap()
-                                .send((index, height, program, record, traces))
+                                .send((
+                                    index,
+                                    height,
+                                    TracesOrInput::CircuitWitness(Box::new(input)),
+                                ))
                                 .unwrap();
 
                             // Advance the turn.
@@ -806,7 +573,10 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
                     let _span = span.enter();
                     loop {
                         let received = { record_and_trace_rx.lock().unwrap().recv() };
-                        if let Ok((index, height, program, record, traces)) = received {
+                        if let Ok((index, height, TracesOrInput::ProgramRecordTraces(boxed_prt))) =
+                            received
+                        {
+                            let (program, record, traces) = *boxed_prt;
                             tracing::debug_span!("batch").in_scope(|| {
                                 // Get the keys.
                                 let (pk, vk) = tracing::debug_span!("Setup compress program")
@@ -826,30 +596,12 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
                                 );
 
                                 // Commit to the record and traces.
-                                let local_data = tracing::debug_span!("commit")
+                                let data = tracing::debug_span!("commit")
                                     .in_scope(|| self.compress_prover.commit(&record, traces));
 
-                                // Observe the commitment.
-                                tracing::debug_span!("observe public values").in_scope(|| {
-                                    challenger.observe_slice(
-                                        &local_data.public_values[0..self.compress_prover.num_pv_elts()],
-                                    );
-                                });
-
                                 // Generate the proof.
                                 let proof = tracing::debug_span!("open").in_scope(|| {
-                                    self.compress_prover
-                                        .open(
-                                            &pk,
-                                            None,
-                                            local_data,
-                                            &mut challenger,
-                                            &[
-                                                <BabyBearPoseidon2 as StarkGenericConfig>::Challenge::zero(),
-                                                <BabyBearPoseidon2 as StarkGenericConfig>::Challenge::zero(),
-                                            ],
-                                        )
-                                        .unwrap()
+                                    self.compress_prover.open(&pk, data, &mut challenger).unwrap()
                                 });
 
                                 // Verify the proof.
@@ -874,6 +626,31 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
                                 // Advance the turn.
                                 prover_sync.advance_turn();
                             });
+                        } else if let Ok((
+                            index,
+                            height,
+                            TracesOrInput::CircuitWitness(witness_box),
+                        )) = received
+                        {
+                            let witness = *witness_box;
+                            if let SP1CircuitWitness::Compress(inner_witness) = witness {
+                                let SP1CompressWitnessValues { vks_and_proofs, is_complete: _ } =
+                                    inner_witness;
+                                assert!(vks_and_proofs.len() == 1);
+                                let (vk, proof) = vks_and_proofs.last().unwrap();
+                                // Wait for our turn to update the state.
+                                prover_sync.wait_for_turn(index);
+
+                                // Send the proof.
+                                proofs_tx
+                                    .lock()
+                                    .unwrap()
+                                    .send((index, height, vk.clone(), proof.clone()))
+                                    .unwrap();
+
+                                // Advance the turn.
+                                prover_sync.advance_turn();
+                            }
                         } else {
                             break;
                         }
@@ -897,6 +674,9 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
                         ShardProof<InnerSC>,
                     )> = Vec::new();
                     loop {
+                        if expected_height == 0 {
+                            break;
+                        }
                         let received = { proofs_rx.lock().unwrap().recv() };
                         if let Ok((index, height, vk, proof)) = received {
                             batch.push((index, height, vk, proof));
@@ -934,7 +714,7 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
                             input_tx
                                 .lock()
                                 .unwrap()
-                                .send((count, next_input_height, input))
+                                .send((count, next_input_height, input, is_last))
                                 .unwrap();
                             input_sync.advance_turn();
                             count += 1;
@@ -990,7 +770,8 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
 
         let input_with_merkle = self.make_merkle_proofs(input);
 
-        let program = self.shrink_program(&input_with_merkle);
+        let program =
+            self.shrink_program(ShrinkAir::<BabyBear>::shrink_shape(), &input_with_merkle);
 
         // Run the compress program.
         let mut runtime = RecursionRuntime::<Val<InnerSC>, Challenge<InnerSC>, _>::new(
@@ -1143,6 +924,262 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
         proof
     }
 
+    /// Returns the cached recursion program for `input`'s shape, compiling it on a cache miss.
+    pub fn recursion_program(
+        &self,
+        input: &SP1RecursionWitnessValues<CoreSC>,
+    ) -> Arc<RecursionProgram<BabyBear>> {
+        tracing::debug!("getting recursion program: {:?}", input.shape());
+        let mut cache = self.recursion_programs.lock().unwrap_or_else(|e| e.into_inner());
+        cache
+            .get_or_insert(input.shape(), || {
+                let misses = self.recursion_cache_misses.fetch_add(1, Ordering::Relaxed);
+                tracing::debug!("core cache miss, misses: {}", misses);
+                // Cache miss: get the operations.
+                let builder_span = tracing::debug_span!("build recursion program").entered();
+                let mut builder = Builder::<InnerConfig>::default();
+
+                let input = input.read(&mut builder);
+                SP1RecursiveVerifier::verify(&mut builder, self.core_prover.machine(), input);
+                let operations = builder.into_operations();
+                builder_span.exit();
+
+                // Compile the program, fixing its shape when a recursion shape config is set.
+                let compiler_span = tracing::debug_span!("compile recursion program").entered();
+                let mut compiler = AsmCompiler::<InnerConfig>::default();
+                let mut program = compiler.compile(operations);
+                if let Some(recursion_shape_config) = &self.recursion_shape_config {
+                    recursion_shape_config.fix_shape(&mut program);
+                }
+                let program = Arc::new(program);
+                compiler_span.exit();
+                program
+            })
+            .clone()
+    }
+
+    pub fn compress_program(
+        &self,
+        shape_tuning: bool,
+        input: &SP1CompressWithVKeyWitnessValues<InnerSC>,
+    ) -> Arc<RecursionProgram<BabyBear>> {
+        if self.recursion_shape_config.is_some() && !shape_tuning {
+            self.compress_programs.get(&input.shape()).cloned().expect("missing compress program")
+        } else {
+            // No shape config, or shape tuning: build the program from the input.
+            Arc::new(compress_program_from_input::<C>(
+                self.recursion_shape_config.as_ref(),
+                &self.compress_prover,
+                self.vk_verification,
+                input,
+            ))
+        }
+    }
+
+    pub fn shrink_program(
+        &self,
+        shrink_shape: RecursionShape,
+        input: &SP1CompressWithVKeyWitnessValues<InnerSC>,
+    ) -> Arc<RecursionProgram<BabyBear>> {
+        // Record the verifier circuit as a list of operations.
+        let build_guard = tracing::debug_span!("build shrink program").entered();
+        let mut circuit = Builder::<InnerConfig>::default();
+        let witness = input.read(&mut circuit);
+        // The verifier uses `compress_prover`'s machine and the `Reduce` output digest.
+        SP1CompressRootVerifierWithVKey::verify(
+            &mut circuit,
+            self.compress_prover.machine(),
+            witness,
+            self.vk_verification,
+            PublicValuesOutputDigest::Reduce,
+        );
+        let ops = circuit.into_operations();
+        build_guard.exit();
+
+        // Compile the operations, then set the requested shape on the program.
+        let compile_guard = tracing::debug_span!("compile shrink program").entered();
+        let mut asm = AsmCompiler::<InnerConfig>::default();
+        let mut compiled = asm.compile(ops);
+
+        compiled.shape = Some(shrink_shape);
+        let compiled = Arc::new(compiled);
+        compile_guard.exit();
+        compiled
+    }
+
+    pub fn wrap_program(&self) -> Arc<RecursionProgram<BabyBear>> {
+        self.wrap_program
+            .get_or_init(|| {
+                // Get the operations (runs only if the program is not initialized yet).
+                let builder_span = tracing::debug_span!("build wrap program").entered();
+                let mut builder = Builder::<WrapConfig>::default();
+                // Build a dummy witness whose only proof has the shrink shape.
+                let shrink_shape: ProofShape = ShrinkAir::<BabyBear>::shrink_shape().into();
+                let input_shape = SP1CompressShape::from(vec![shrink_shape]);
+                let shape = SP1CompressWithVkeyShape {
+                    compress_shape: input_shape,
+                    merkle_tree_height: self.vk_merkle_tree.height,
+                };
+                let dummy_input =
+                    SP1CompressWithVKeyWitnessValues::dummy(self.shrink_prover.machine(), &shape);
+
+                let input = dummy_input.read(&mut builder);
+
+                // Attest that the merkle tree root is correct.
+                let root = input.merkle_var.root;
+                for (val, expected) in root.iter().zip(self.vk_root.iter()) {
+                    builder.assert_felt_eq(*val, *expected);
+                }
+                // Verify the proof against `shrink_prover`'s machine with the `Root` digest.
+                SP1CompressRootVerifierWithVKey::verify(
+                    &mut builder,
+                    self.shrink_prover.machine(),
+                    input,
+                    self.vk_verification,
+                    PublicValuesOutputDigest::Root,
+                );
+
+                let operations = builder.into_operations();
+                builder_span.exit();
+
+                // Compile the program.
+                let compiler_span = tracing::debug_span!("compile wrap program").entered();
+                let mut compiler = AsmCompiler::<WrapConfig>::default();
+                let program = Arc::new(compiler.compile(operations));
+                compiler_span.exit();
+                program
+            })
+            .clone()
+    }
+
+    pub fn deferred_program(
+        &self,
+        input: &SP1DeferredWitnessValues<InnerSC>,
+    ) -> Arc<RecursionProgram<BabyBear>> {
+        // Get the operations.
+        // Nested spans cover the input read and the verification separately.
+        let operations_span =
+            tracing::debug_span!("get operations for the deferred program").entered();
+        let mut builder = Builder::<InnerConfig>::default();
+        let input_read_span = tracing::debug_span!("Read input values").entered();
+        let input = input.read(&mut builder);
+        input_read_span.exit();
+        let verify_span = tracing::debug_span!("Verify deferred program").entered();
+
+        // Verify the proof.
+        SP1DeferredVerifier::verify(
+            &mut builder,
+            self.compress_prover.machine(),
+            input,
+            self.vk_verification,
+        );
+        verify_span.exit();
+        let operations = builder.into_operations();
+        operations_span.exit();
+
+        // Compile the program, fixing its shape when a recursion shape config is set.
+        let compiler_span = tracing::debug_span!("compile deferred program").entered();
+        let mut compiler = AsmCompiler::<InnerConfig>::default();
+        let mut program = compiler.compile(operations);
+        if let Some(recursion_shape_config) = &self.recursion_shape_config {
+            recursion_shape_config.fix_shape(&mut program);
+        }
+        let program = Arc::new(program);
+        compiler_span.exit();
+        program
+    }
+
+    /// Prepares the inputs for the recursion programs.
+    ///
+    /// Builds one witness per chunk of at most `batch_size` shard proofs; only the first chunk
+    /// is marked `is_first_shard`. Panics if `batch_size` is zero (see `slice::chunks`).
+    pub fn get_recursion_core_inputs(
+        &self,
+        vk: &StarkVerifyingKey<CoreSC>,
+        shard_proofs: &[ShardProof<CoreSC>],
+        batch_size: usize,
+        is_complete: bool,
+        deferred_digest: [Val<CoreSC>; 8],
+    ) -> Vec<SP1RecursionWitnessValues<CoreSC>> {
+        shard_proofs
+            .chunks(batch_size)
+            .enumerate()
+            .map(|(batch_idx, batch)| SP1RecursionWitnessValues {
+                vk: vk.clone(),
+                shard_proofs: batch.to_vec(),
+                is_complete,
+                is_first_shard: batch_idx == 0,
+                vk_root: self.vk_root,
+                reconstruct_deferred_digest: deferred_digest,
+            })
+            .collect()
+    }
+
+    pub fn get_recursion_deferred_inputs<'a>(
+        &'a self,
+        vk: &'a StarkVerifyingKey<CoreSC>,
+        deferred_proofs: &[SP1ReduceProof<InnerSC>],
+        batch_size: usize,
+    ) -> (Vec<SP1DeferredWitnessValues<InnerSC>>, [BabyBear; 8]) {
+        // Prepare the deferred-proof witnesses, threading a running digest through the batches.
+        let mut deferred_digest = [Val::<InnerSC>::zero(); DIGEST_SIZE];
+        let mut deferred_inputs = Vec::new();
+
+        for batch in deferred_proofs.chunks(batch_size) {
+            let vks_and_proofs =
+                batch.iter().cloned().map(|proof| (proof.vk, proof.proof)).collect::<Vec<_>>();
+            // Run the batch through `make_merkle_proofs` and unpack the two resulting parts.
+            let input = SP1CompressWitnessValues { vks_and_proofs, is_complete: true };
+            let input = self.make_merkle_proofs(input);
+            let SP1CompressWithVKeyWitnessValues { compress_val, merkle_val } = input;
+
+            deferred_inputs.push(SP1DeferredWitnessValues {
+                vks_and_proofs: compress_val.vks_and_proofs,
+                vk_merkle_data: merkle_val,
+                start_reconstruct_deferred_digest: deferred_digest,
+                is_complete: false,
+                sp1_vk_digest: vk.hash_babybear(),
+                end_pc: vk.pc_start,
+                end_shard: BabyBear::one(),
+                end_execution_shard: BabyBear::one(),
+                init_addr_bits: [BabyBear::zero(); 32],
+                finalize_addr_bits: [BabyBear::zero(); 32],
+                committed_value_digest: [Word::<BabyBear>([BabyBear::zero(); 4]); 8],
+                deferred_proofs_digest: [BabyBear::zero(); 8],
+            });
+            // Fold this batch into the digest; the next witness starts from the updated value.
+            deferred_digest = Self::hash_deferred_proofs(deferred_digest, batch);
+        }
+        (deferred_inputs, deferred_digest)
+    }
+
+    /// Builds the first-layer recursion witnesses: deferred inputs first, then core inputs.
+    #[allow(clippy::type_complexity)]
+    pub fn get_first_layer_inputs<'a>(
+        &'a self,
+        vk: &'a SP1VerifyingKey,
+        shard_proofs: &[ShardProof<InnerSC>],
+        deferred_proofs: &[SP1ReduceProof<InnerSC>],
+        batch_size: usize,
+    ) -> Vec<SP1CircuitWitness> {
+        let (deferred_witnesses, deferred_digest) =
+            self.get_recursion_deferred_inputs(&vk.vk, deferred_proofs, batch_size);
+
+        let complete = shard_proofs.len() == 1 && deferred_proofs.is_empty();
+        let core_witnesses = self.get_recursion_core_inputs(
+            &vk.vk,
+            shard_proofs,
+            batch_size,
+            complete,
+            deferred_digest,
+        );
+        deferred_witnesses
+            .into_iter()
+            .map(SP1CircuitWitness::Deferred)
+            .chain(core_witnesses.into_iter().map(SP1CircuitWitness::Core))
+            .collect()
+    }
+
     /// Accumulate deferred proofs into a single digest.
     pub fn hash_deferred_proofs(
         prev_digest: [Val<CoreSC>; DIGEST_SIZE],
@@ -1215,6 +1252,39 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
     }
 }
 
+/// Builds and compiles the compress program for `input`, fixing its shape when `config` is given.
+pub fn compress_program_from_input<C: SP1ProverComponents>(
+    config: Option<&RecursionShapeConfig<BabyBear, CompressAir<BabyBear>>>,
+    compress_prover: &C::CompressProver,
+    vk_verification: bool,
+    input: &SP1CompressWithVKeyWitnessValues<BabyBearPoseidon2>,
+) -> RecursionProgram<BabyBear> {
+    let build_guard = tracing::debug_span!("build compress program").entered();
+    let mut circuit = Builder::<InnerConfig>::default();
+    // Load the witness into the circuit.
+    let witness = input.read(&mut circuit);
+    // Emit the verifier over `compress_prover`'s machine with the `Reduce` output digest.
+    SP1CompressWithVKeyVerifier::verify(
+        &mut circuit,
+        compress_prover.machine(),
+        witness,
+        vk_verification,
+        PublicValuesOutputDigest::Reduce,
+    );
+    let ops = circuit.into_operations();
+    build_guard.exit();
+
+    // Compile the operations into a program.
+    let compile_guard = tracing::debug_span!("compile compress program").entered();
+    let mut asm = AsmCompiler::<InnerConfig>::default();
+    let mut compiled = asm.compile(ops);
+    if let Some(shape_config) = config {
+        shape_config.fix_shape(&mut compiled);
+    }
+    compile_guard.exit();
+    compiled
+}
+
 #[cfg(any(test, feature = "export-tests"))]
 pub mod tests {
 
@@ -1282,10 +1352,10 @@ pub mod tests {
         let context = SP1Context::default();
 
         tracing::info!("setup elf");
-        let (pk, vk) = prover.setup(elf);
+        let (_, pk_d, program, vk) = prover.setup(elf);
 
         tracing::info!("prove core");
-        let core_proof = prover.prove_core(&pk, &stdin, opts, context)?;
+        let core_proof = prover.prove_core(&pk_d, program, &stdin, opts, context)?;
         let public_values = core_proof.public_values.clone();
 
         if env::var("COLLECT_SHAPES").is_ok() {
@@ -1421,16 +1491,22 @@ pub mod tests {
         let prover = SP1Prover::<C>::new();
 
         tracing::info!("setup keccak elf");
-        let (keccak_pk, keccak_vk) = prover.setup(keccak_elf);
+        let (_, keccak_pk_d, keccak_program, keccak_vk) = prover.setup(keccak_elf);
 
         tracing::info!("setup verify elf");
-        let (verify_pk, verify_vk) = prover.setup(verify_elf);
+        let (_, verify_pk_d, verify_program, verify_vk) = prover.setup(verify_elf);
 
         tracing::info!("prove subproof 1");
         let mut stdin = SP1Stdin::new();
         stdin.write(&1usize);
         stdin.write(&vec![0u8, 0, 0]);
-        let deferred_proof_1 = prover.prove_core(&keccak_pk, &stdin, opts, Default::default())?;
+        let deferred_proof_1 = prover.prove_core(
+            &keccak_pk_d,
+            keccak_program.clone(),
+            &stdin,
+            opts,
+            Default::default(),
+        )?;
         let pv_1 = deferred_proof_1.public_values.as_slice().to_vec().clone();
 
         // Generate a second proof of keccak of various inputs.
@@ -1440,16 +1516,19 @@ pub mod tests {
         stdin.write(&vec![0u8, 1, 2]);
         stdin.write(&vec![2, 3, 4]);
         stdin.write(&vec![5, 6, 7]);
-        let deferred_proof_2 = prover.prove_core(&keccak_pk, &stdin, opts, Default::default())?;
+        let deferred_proof_2 =
+            prover.prove_core(&keccak_pk_d, keccak_program, &stdin, opts, Default::default())?;
         let pv_2 = deferred_proof_2.public_values.as_slice().to_vec().clone();
 
         // Generate recursive proof of first subproof.
         tracing::info!("compress subproof 1");
         let deferred_reduce_1 = prover.compress(&keccak_vk, deferred_proof_1, vec![], opts)?;
+        prover.verify_compressed(&deferred_reduce_1, &keccak_vk)?;
 
         // Generate recursive proof of second subproof.
         tracing::info!("compress subproof 2");
         let deferred_reduce_2 = prover.compress(&keccak_vk, deferred_proof_2, vec![], opts)?;
+        prover.verify_compressed(&deferred_reduce_2, &keccak_vk)?;
 
         // Run verify program with keccak vkey, subproofs, and their committed values.
         let mut stdin = SP1Stdin::new();
@@ -1467,7 +1546,8 @@ pub mod tests {
         stdin.write_proof(deferred_reduce_2.clone(), keccak_vk.vk.clone());
 
         tracing::info!("proving verify program (core)");
-        let verify_proof = prover.prove_core(&verify_pk, &stdin, opts, Default::default())?;
+        let verify_proof =
+            prover.prove_core(&verify_pk_d, verify_program, &stdin, opts, Default::default())?;
         // let public_values = verify_proof.public_values.clone();
 
         // Generate recursive proof of verify program
@@ -1517,6 +1597,7 @@ pub mod tests {
         // docker image which has a different API than the current. So we need to wait until the
         // next release (v1.2.0+), and then switch it back.
         let prover = SP1Prover::<DefaultProverComponents>::new();
+
         test_e2e_prover::<DefaultProverComponents>(
             &prover,
             elf,
@@ -1535,13 +1616,13 @@ pub mod tests {
         test_e2e_with_deferred_proofs_prover::<DefaultProverComponents>(SP1ProverOpts::default())
     }
 
-    #[test]
-    fn test_deterministic_setup() {
-        setup_logger();
-        let prover = SP1Prover::<DefaultProverComponents>::new();
-        let program = test_artifacts::FIBONACCI_ELF;
-        let (pk, _) = prover.setup(program);
-        let pk2 = prover.setup(program).0;
-        assert_eq!(pk.pk.commit, pk2.pk.commit);
-    }
+    // #[test]
+    // fn test_deterministic_setup() {
+    //     setup_logger();
+    //     let prover = SP1Prover::<DefaultProverComponents>::new();
+    //     let program = test_artifacts::FIBONACCI_ELF;
+    //     let (pk, vk) = prover.setup(&program);
+    //     let pk2 = prover.setup(&program).0;
+    //     assert_eq!(pk.pk.commit, pk2.pk.commit);
+    // }
 }
diff --git a/crates/prover/src/shapes.rs b/crates/prover/src/shapes.rs
index 74f7ba177e..95f5b2874d 100644
--- a/crates/prover/src/shapes.rs
+++ b/crates/prover/src/shapes.rs
@@ -8,20 +8,23 @@ use std::{
 };
 
 use eyre::Result;
+use serde::{Deserialize, Serialize};
 use thiserror::Error;
 
 use p3_baby_bear::BabyBear;
 use p3_field::AbstractField;
-use serde::{Deserialize, Serialize};
 use sp1_core_machine::riscv::CoreShapeConfig;
 use sp1_recursion_circuit::machine::{
     SP1CompressWithVKeyWitnessValues, SP1CompressWithVkeyShape, SP1DeferredShape,
     SP1DeferredWitnessValues, SP1RecursionShape, SP1RecursionWitnessValues,
 };
-use sp1_recursion_core::{shape::RecursionShapeConfig, RecursionProgram};
+use sp1_recursion_core::{
+    shape::{RecursionShape, RecursionShapeConfig},
+    RecursionProgram,
+};
 use sp1_stark::{MachineProver, ProofShape, DIGEST_SIZE};
 
-use crate::{components::SP1ProverComponents, CompressAir, HashableKey, SP1Prover};
+use crate::{components::SP1ProverComponents, CompressAir, HashableKey, SP1Prover, ShrinkAir};
 
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
 pub enum SP1ProofShape {
@@ -55,6 +58,79 @@ pub enum VkBuildError {
     Bincode(#[from] bincode::Error),
 }
 
+/// Checks that a recursion program can be built for every maximal proof shape.
+///
+/// Returns `true` if no program build panicked; each failing shape is logged as a warning.
+pub fn check_shapes<C: SP1ProverComponents>(
+    reduce_batch_size: usize,
+    no_precompiles: bool,
+    num_compiler_workers: usize,
+    prover: &SP1Prover<C>,
+) -> bool {
+    let (shape_tx, shape_rx) =
+        std::sync::mpsc::sync_channel::<SP1CompressProgramShape>(num_compiler_workers);
+    let (panic_tx, panic_rx) = std::sync::mpsc::channel();
+    let core_shape_config = prover.core_shape_config.as_ref().expect("core shape config not found");
+    let recursion_shape_config =
+        prover.recursion_shape_config.as_ref().expect("recursion shape config not found");
+
+    let shape_rx = Mutex::new(shape_rx);
+
+    let all_maximal_shapes = SP1ProofShape::generate_maximal_shapes(
+        core_shape_config,
+        recursion_shape_config,
+        reduce_batch_size,
+        no_precompiles,
+    )
+    .collect::<BTreeSet<SP1ProofShape>>();
+    let num_shapes = all_maximal_shapes.len();
+    tracing::info!("number of shapes: {}", num_shapes);
+
+    // The Merkle tree height.
+    let height = num_shapes.next_power_of_two().ilog2() as usize;
+
+    std::thread::scope(|s| {
+        // Initialize compiler workers.
+        for _ in 0..num_compiler_workers {
+            let shape_rx = &shape_rx;
+            let panic_tx = panic_tx.clone();
+            s.spawn(move || {
+                loop {
+                    // Hold the lock only for the receive so that workers build in parallel.
+                    let received = shape_rx.lock().unwrap().recv();
+                    let Ok(shape) = received else { break };
+                    tracing::info!("shape is {:?}", shape);
+                    let program = catch_unwind(AssertUnwindSafe(|| {
+                        // Try to build the recursion program from the given shape.
+                        prover.program_from_shape(true, shape.clone(), None)
+                    }));
+                    if let Err(e) = program {
+                        tracing::warn!(
+                            "Program generation failed for shape {:?}, with error: {:?}",
+                            shape,
+                            e
+                        );
+                        panic_tx.send(true).unwrap();
+                    }
+                }
+            });
+        }
+
+        // Generate shapes and send them to the compiler workers.
+        all_maximal_shapes.into_iter().for_each(|program_shape| {
+            shape_tx
+                .send(SP1CompressProgramShape::from_proof_shape(program_shape, height))
+                .unwrap();
+        });
+
+        drop(shape_tx);
+        drop(panic_tx);
+
+        // If the panic receiver has no panics, then the shape is correct.
+        panic_rx.iter().next().is_none()
+    })
+}
+
 pub fn build_vk_map<C: SP1ProverComponents>(
     reduce_batch_size: usize,
     dummy: bool,
@@ -111,7 +187,7 @@ pub fn build_vk_map<C: SP1ProverComponents>(
                     while let Ok((i, shape)) = shape_rx.lock().unwrap().recv() {
                         println!("shape {} is {:?}", i, shape);
                         let program = catch_unwind(AssertUnwindSafe(|| {
-                            prover.program_from_shape(shape.clone())
+                            prover.program_from_shape(false, shape.clone(), None)
                         }));
                         let is_shrink = matches!(shape, SP1CompressProgramShape::Shrink(_));
                         match program {
@@ -244,10 +320,40 @@ impl SP1ProofShape {
     pub fn generate_compress_shapes(
         recursion_shape_config: &'_ RecursionShapeConfig<BabyBear, CompressAir<BabyBear>>,
         reduce_batch_size: usize,
-    ) -> impl Iterator<Item = Self> + '_ {
-        (1..=reduce_batch_size).flat_map(|batch_size| {
-            recursion_shape_config.get_all_shape_combinations(batch_size).map(Self::Compress)
-        })
+    ) -> impl Iterator<Item = Vec<ProofShape>> + '_ {
+        recursion_shape_config.get_all_shape_combinations(reduce_batch_size)
+    }
+
+    /// Enumerates the maximal proof shapes, in this order: recursion shapes from the core
+    /// shapes (with precompiles unless `no_precompiles`), compress shapes for every batch size
+    /// in `1..=reduce_batch_size`, then deferred and shrink shapes, each taken from the
+    /// single-proof shape combinations.
+    pub fn generate_maximal_shapes<'a>(
+        core_shape_config: &'a CoreShapeConfig<BabyBear>,
+        recursion_shape_config: &'a RecursionShapeConfig<BabyBear, CompressAir<BabyBear>>,
+        reduce_batch_size: usize,
+        no_precompiles: bool,
+    ) -> impl Iterator<Item = Self> + 'a {
+        let core_shape_iter = if no_precompiles {
+            core_shape_config.maximal_core_shapes().into_iter()
+        } else {
+            core_shape_config.maximal_core_plus_precompile_shapes().into_iter()
+        };
+        let recursion = core_shape_iter.map(|core_shape| {
+            Self::Recursion(ProofShape {
+                chip_information: core_shape.inner.into_iter().collect(),
+            })
+        });
+        let compress = (1..=reduce_batch_size).flat_map(|n| {
+            recursion_shape_config.get_all_shape_combinations(n).map(Self::Compress)
+        });
+        let deferred = recursion_shape_config
+            .get_all_shape_combinations(1)
+            .map(|mut combo| Self::Deferred(combo.pop().unwrap()));
+        let shrink = recursion_shape_config
+            .get_all_shape_combinations(1)
+            .map(|mut combo| Self::Shrink(combo.pop().unwrap()));
+        recursion.chain(compress).chain(deferred).chain(shrink)
     }
 
     pub fn dummy_vk_map<'a>(
@@ -284,7 +390,9 @@ impl SP1CompressProgramShape {
 impl<C: SP1ProverComponents> SP1Prover<C> {
     pub fn program_from_shape(
         &self,
+        shape_tuning: bool,
         shape: SP1CompressProgramShape,
+        shrink_shape: Option<RecursionShape>,
     ) -> Arc<RecursionProgram<BabyBear>> {
         match shape {
             SP1CompressProgramShape::Recursion(shape) => {
@@ -298,12 +406,15 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
             SP1CompressProgramShape::Compress(shape) => {
                 let input =
                     SP1CompressWithVKeyWitnessValues::dummy(self.compress_prover.machine(), &shape);
-                self.compress_program(&input)
+                self.compress_program(shape_tuning, &input)
             }
             SP1CompressProgramShape::Shrink(shape) => {
                 let input =
                     SP1CompressWithVKeyWitnessValues::dummy(self.compress_prover.machine(), &shape);
-                self.shrink_program(&input)
+                self.shrink_program(
+                    shrink_shape.unwrap_or_else(ShrinkAir::<BabyBear>::shrink_shape),
+                    &input,
+                )
             }
         }
     }
diff --git a/crates/prover/src/verify.rs b/crates/prover/src/verify.rs
index 1a2ff25502..115b52a0e0 100644
--- a/crates/prover/src/verify.rs
+++ b/crates/prover/src/verify.rs
@@ -228,12 +228,14 @@ impl<C: SP1ProverComponents> SP1Prover<C> {
         // - `deferred_proofs_digest` should be zero.
         //
         // Transition:
-        // - If `committed_value_digest_prev` is not zero, then `committed_value_digest` should equal
+        // - If `committed_value_digest_prev` is not zero, then `committed_value_digest` should
+        //   equal
         //  `committed_value_digest_prev`. Otherwise, `committed_value_digest` should equal zero.
         // - If `deferred_proofs_digest_prev` is not zero, then `deferred_proofs_digest` should
         //   equal
         //  `deferred_proofs_digest_prev`. Otherwise, `deferred_proofs_digest` should equal zero.
-        // - If it's not a shard with "CPU", then `committed_value_digest` should not change from the
+        // - If it's not a shard with "CPU", then `committed_value_digest` should not change from
+        //   the
         //  previous shard.
         // - If it's not a shard with "CPU", then `deferred_proofs_digest` should not change from
         //   the
diff --git a/crates/recursion/circuit/src/constraints.rs b/crates/recursion/circuit/src/constraints.rs
index fa3c506764..d18dc555af 100644
--- a/crates/recursion/circuit/src/constraints.rs
+++ b/crates/recursion/circuit/src/constraints.rs
@@ -34,10 +34,11 @@ where
     A: MachineAir<C::F> + for<'a> Air<RecursiveVerifierConstraintFolder<'a, C>>,
 {
     #[allow(clippy::too_many_arguments)]
+    #[allow(clippy::type_complexity)]
     pub fn verify_constraints(
         builder: &mut Builder<C>,
         chip: &MachineChip<SC, A>,
-        opening: &ChipOpenedValues<Ext<C::F, C::EF>>,
+        opening: &ChipOpenedValues<Felt<C::F>, Ext<C::F, C::EF>>,
         trace_domain: TwoAdicMultiplicativeCoset<C::F>,
         qc_domains: Vec<TwoAdicMultiplicativeCoset<C::F>>,
         zeta: Ext<C::F, C::EF>,
@@ -65,10 +66,11 @@ where
         builder.assert_ext_eq(folded_constraints * sels.inv_zeroifier, quotient);
     }
 
+    #[allow(clippy::type_complexity)]
     pub fn eval_constraints(
         builder: &mut Builder<C>,
         chip: &MachineChip<SC, A>,
-        opening: &ChipOpenedValues<Ext<C::F, C::EF>>,
+        opening: &ChipOpenedValues<Felt<C::F>, Ext<C::F, C::EF>>,
         selectors: &LagrangeSelectors<Ext<C::F, C::EF>>,
         alpha: Ext<C::F, C::EF>,
         permutation_challenges: &[Ext<C::F, C::EF>],
@@ -101,7 +103,8 @@ where
             main: opening.main.view(),
             perm: perm_opening.view(),
             perm_challenges: permutation_challenges,
-            cumulative_sums: &[opening.global_cumulative_sum, opening.local_cumulative_sum],
+            local_cumulative_sum: &opening.local_cumulative_sum,
+            global_cumulative_sum: &opening.global_cumulative_sum,
             public_values,
             is_first_row: selectors.is_first_row,
             is_last_row: selectors.is_last_row,
@@ -115,9 +118,10 @@ where
         builder.eval(folder.accumulator)
     }
 
+    #[allow(clippy::type_complexity)]
     pub fn recompute_quotient(
         builder: &mut Builder<C>,
-        opening: &ChipOpenedValues<Ext<C::F, C::EF>>,
+        opening: &ChipOpenedValues<Felt<C::F>, Ext<C::F, C::EF>>,
         qc_domains: &[TwoAdicMultiplicativeCoset<C::F>],
         zeta: Ext<C::F, C::EF>,
     ) -> Ext<C::F, C::EF> {
@@ -151,8 +155,9 @@ where
                             - C::F::one();
                         (
                             {
-                                // We use the precomputed powers of zeta to compute (inline) the value of
-                                // `other_domain.zp_at_point_variable(builder, zeta)`.
+                                // We use the precomputed powers of zeta to compute (inline) the
+                                // value of
+                                // `other_domain.zp_at_point_variable(builder, zeta)`.
                                 let z: Ext<_, _> = builder.eval(
                                     zetas[other_domain.log_n] * SymbolicFelt::from_f(shift_power)
                                         - SymbolicExt::from_f(C::EF::one()),
@@ -189,9 +194,10 @@ where
         )
     }
 
+    #[allow(clippy::type_complexity)]
     pub fn verify_opening_shape(
         chip: &MachineChip<SC, A>,
-        opening: &ChipOpenedValues<Ext<C::F, C::EF>>,
+        opening: &ChipOpenedValues<Felt<C::F>, Ext<C::F, C::EF>>,
     ) -> Result<(), OpeningShapeError> {
         // Verify that the preprocessed width matches the expected value for the chip.
         if opening.preprocessed.local.len() != chip.preprocessed_width() {
diff --git a/crates/recursion/circuit/src/fri.rs b/crates/recursion/circuit/src/fri.rs
index ea343bea4e..68af777b22 100644
--- a/crates/recursion/circuit/src/fri.rs
+++ b/crates/recursion/circuit/src/fri.rs
@@ -404,8 +404,8 @@ pub fn dummy_query_proof(
     }
 }
 
-/// Make a dummy PCS proof for a given proof shape. Used to generate vkey information for fixed proof
-/// shapes.
+/// Make a dummy PCS proof for a given proof shape. Used to generate vkey information for fixed
+/// proof shapes.
 ///
 /// The parameter `batch_shapes` contains (width, height) data for each matrix in each batch.
 pub fn dummy_pcs_proof(
diff --git a/crates/recursion/circuit/src/hash.rs b/crates/recursion/circuit/src/hash.rs
index bb499172d1..f312b812ce 100644
--- a/crates/recursion/circuit/src/hash.rs
+++ b/crates/recursion/circuit/src/hash.rs
@@ -1,5 +1,7 @@
-use std::fmt::Debug;
-use std::iter::{repeat, zip};
+use std::{
+    fmt::Debug,
+    iter::{repeat, zip},
+};
 
 use itertools::Itertools;
 use p3_baby_bear::BabyBear;
@@ -11,11 +13,11 @@ use sp1_recursion_compiler::{
     circuit::CircuitV2Builder,
     ir::{Builder, Config, DslIr, Felt, Var},
 };
-use sp1_recursion_core::stark::{outer_perm, OUTER_MULTI_FIELD_CHALLENGER_WIDTH};
-use sp1_recursion_core::{stark::BabyBearPoseidon2Outer, DIGEST_SIZE};
-use sp1_recursion_core::{HASH_RATE, PERMUTATION_WIDTH};
-use sp1_stark::baby_bear_poseidon2::BabyBearPoseidon2;
-use sp1_stark::inner_perm;
+use sp1_recursion_core::{
+    stark::{outer_perm, BabyBearPoseidon2Outer, OUTER_MULTI_FIELD_CHALLENGER_WIDTH},
+    DIGEST_SIZE, HASH_RATE, PERMUTATION_WIDTH,
+};
+use sp1_stark::{baby_bear_poseidon2::BabyBearPoseidon2, inner_perm};
 
 use crate::{
     challenger::{reduce_32, POSEIDON_2_BB_RATE},
diff --git a/crates/recursion/circuit/src/machine/complete.rs b/crates/recursion/circuit/src/machine/complete.rs
index 58df24f42e..02aaf61217 100644
--- a/crates/recursion/circuit/src/machine/complete.rs
+++ b/crates/recursion/circuit/src/machine/complete.rs
@@ -1,6 +1,7 @@
 use itertools::Itertools;
+use p3_baby_bear::BabyBear;
 use p3_field::AbstractField;
-
+use sp1_recursion_compiler::circuit::CircuitV2Builder;
 use sp1_recursion_compiler::ir::{Builder, Config, Felt};
 use sp1_recursion_core::air::RecursionPublicValues;
 
@@ -8,7 +9,7 @@ use sp1_recursion_core::air::RecursionPublicValues;
 ///
 /// The assertions consist of checking all the expected boundary conditions from a compress proof
 /// that represents the end of the recursion tower.
-pub(crate) fn assert_complete<C: Config>(
+pub(crate) fn assert_complete<C: Config<F = BabyBear>>(
     builder: &mut Builder<C>,
     public_values: &RecursionPublicValues<Felt<C::F>>,
     is_complete: Felt<C::F>,
@@ -19,11 +20,9 @@ pub(crate) fn assert_complete<C: Config>(
         start_shard,
         next_shard,
         start_execution_shard,
-        cumulative_sum,
         start_reconstruct_deferred_digest,
         end_reconstruct_deferred_digest,
-        leaf_challenger,
-        end_reconstruct_challenger,
+        global_cumulative_sum,
         contains_execution_shard,
         ..
     } = public_values;
@@ -48,13 +47,6 @@ pub(crate) fn assert_complete<C: Config>(
     // Assert that the start execution shard is equal to 1.
     builder.assert_felt_eq(is_complete * (*start_execution_shard - C::F::one()), C::F::zero());
 
-    // Assert that the end reconstruct challenger is equal to the leaf challenger.
-    for (end_challenger_d, leaf_challenger_d) in
-        end_reconstruct_challenger.into_iter().zip(*leaf_challenger)
-    {
-        builder.assert_felt_eq(is_complete * (end_challenger_d - leaf_challenger_d), C::F::zero());
-    }
-
     // The start reconstruct deferred digest should be zero.
     for start_digest_word in start_reconstruct_deferred_digest {
         builder.assert_felt_eq(is_complete * *start_digest_word, C::F::zero());
@@ -68,8 +60,5 @@ pub(crate) fn assert_complete<C: Config>(
             .assert_felt_eq(is_complete * (*end_digest_word - *deferred_digest_word), C::F::zero());
     }
 
-    // Assert that the cumulative sum is zero.
-    for b in cumulative_sum.iter() {
-        builder.assert_felt_eq(is_complete * *b, C::F::zero());
-    }
+    builder.assert_digest_zero_v2(is_complete, *global_cumulative_sum);
 }
diff --git a/crates/recursion/circuit/src/machine/compress.rs b/crates/recursion/circuit/src/machine/compress.rs
index fe99eb43c2..4cbb6f6adf 100644
--- a/crates/recursion/circuit/src/machine/compress.rs
+++ b/crates/recursion/circuit/src/machine/compress.rs
@@ -15,12 +15,9 @@ use p3_field::AbstractField;
 use p3_matrix::dense::RowMajorMatrix;
 
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use sp1_recursion_compiler::ir::{Builder, Ext, Felt, SymbolicFelt};
+use sp1_recursion_compiler::ir::{Builder, Felt, SymbolicFelt};
 
-use sp1_recursion_core::{
-    air::{ChallengerPublicValues, RecursionPublicValues, RECURSIVE_PROOF_NUM_PV_ELTS},
-    D,
-};
+use sp1_recursion_core::air::{RecursionPublicValues, RECURSIVE_PROOF_NUM_PV_ELTS};
 
 use sp1_stark::{
     air::{MachineAir, POSEIDON_NUM_WORDS, PV_DIGEST_NUM_WORDS},
@@ -37,10 +34,10 @@ use crate::{
         root_public_values_digest,
     },
     stark::{dummy_vk_and_shard_proof, ShardProofVariable, StarkVerifier},
-    utils::uninit_challenger_pv,
     BabyBearFriConfig, BabyBearFriConfigVariable, CircuitConfig, VerifyingKeyVariable,
 };
 
+use sp1_recursion_compiler::circuit::CircuitV2Builder;
 /// A program to verify a batch of recursive proofs and aggregate their public values.
 #[derive(Debug, Clone, Copy)]
 pub struct SP1CompressVerifier<C, SC, A> {
@@ -127,12 +124,6 @@ where
         let mut exit_code: Felt<_> = builder.uninit();
 
         let mut execution_shard: Felt<_> = unsafe { MaybeUninit::zeroed().assume_init() };
-        let mut initial_reconstruct_challenger_values: ChallengerPublicValues<Felt<C::F>> =
-            unsafe { uninit_challenger_pv(builder) };
-        let mut reconstruct_challenger_values: ChallengerPublicValues<Felt<C::F>> =
-            unsafe { uninit_challenger_pv(builder) };
-        let mut leaf_challenger_values: ChallengerPublicValues<Felt<C::F>> =
-            unsafe { uninit_challenger_pv(builder) };
         let mut committed_value_digest: [Word<Felt<_>>; PV_DIGEST_NUM_WORDS] =
             array::from_fn(|_| {
                 Word(array::from_fn(|_| unsafe { MaybeUninit::zeroed().assume_init() }))
@@ -141,8 +132,7 @@ where
             array::from_fn(|_| unsafe { MaybeUninit::zeroed().assume_init() });
         let mut reconstruct_deferred_digest: [Felt<_>; POSEIDON_NUM_WORDS] =
             core::array::from_fn(|_| unsafe { MaybeUninit::zeroed().assume_init() });
-        let mut global_cumulative_sum: [Felt<_>; D] =
-            core::array::from_fn(|_| builder.eval(C::F::zero()));
+        let mut global_cumulative_sums = Vec::new();
         let mut init_addr_bits: [Felt<_>; 32] =
             core::array::from_fn(|_| unsafe { MaybeUninit::zeroed().assume_init() });
         let mut finalize_addr_bits: [Felt<_>; 32] =
@@ -162,10 +152,11 @@ where
             // Observe the vk and start pc.
             challenger.observe(builder, vk.commitment);
             challenger.observe(builder, vk.pc_start);
+            challenger.observe_slice(builder, vk.initial_global_cumulative_sum.0.x.0);
+            challenger.observe_slice(builder, vk.initial_global_cumulative_sum.0.y.0);
+            // Observe the padding.
             let zero: Felt<_> = builder.eval(C::F::zero());
-            for _ in 0..7 {
-                challenger.observe(builder, zero);
-            }
+            challenger.observe(builder, zero);
 
             // Observe the main commitment and public values.
             challenger.observe_slice(
@@ -173,15 +164,7 @@ where
                 shard_proof.public_values[0..machine.num_pv_elts()].iter().copied(),
             );
 
-            let zero_ext: Ext<C::F, C::EF> = builder.eval(C::F::zero());
-            StarkVerifier::verify_shard(
-                builder,
-                &vk,
-                machine,
-                &mut challenger,
-                &shard_proof,
-                &[zero_ext, zero_ext],
-            );
+            StarkVerifier::verify_shard(builder, &vk, machine, &mut challenger, &shard_proof);
 
             // Get the current public values.
             let current_public_values: &RecursionPublicValues<Felt<C::F>> =
@@ -251,14 +234,6 @@ where
                     *first_bit = *current_bit;
                 }
 
-                // Initialize the leaf challenger public values.
-                leaf_challenger_values = current_public_values.leaf_challenger;
-
-                // Initialize the initial reconstruct challenger public values.
-                initial_reconstruct_challenger_values =
-                    current_public_values.start_reconstruct_challenger;
-                reconstruct_challenger_values = current_public_values.start_reconstruct_challenger;
-
                 // Assign the committed values and deferred proof digests.
                 for (word, current_word) in committed_value_digest
                     .iter_mut()
@@ -310,7 +285,7 @@ where
                 );
                 // A flag to indicate whether the first execution shard has been seen. We have:
                 // - `is_first_execution_shard_seen`  = current_contains_execution_shard &&
-                //                                     !execution_shard_seen_before.
+                //   !execution_shard_seen_before.
                 // Since `contains_execution_shard` is the boolean flag used to denote if we have
                 // seen an execution shard, we can use it to denote if we have seen an execution
                 // shard before.
@@ -354,25 +329,10 @@ where
                 builder.assert_felt_eq(*bit, *current_bit);
             }
 
-            // Assert that the leaf challenger is always the same.
-            for (current, expected) in
-                leaf_challenger_values.into_iter().zip(current_public_values.leaf_challenger)
-            {
-                builder.assert_felt_eq(current, expected);
-            }
-
-            // Assert that the current challenger matches the start reconstruct challenger.
-            for (current, expected) in reconstruct_challenger_values
-                .into_iter()
-                .zip(current_public_values.start_reconstruct_challenger)
-            {
-                builder.assert_felt_eq(current, expected);
-            }
-
             // Digest constraints.
             {
-                // If `committed_value_digest` is not zero, then `public_values.committed_value_digest
-                // should be the current.
+                // If `committed_value_digest` is not zero, then
+                // `public_values.committed_value_digest` should be the current.
 
                 // Set a flags to indicate whether `committed_value_digest` is non-zero. The flags
                 // are given by the elements of the array, and they will be used as filters to
@@ -442,8 +402,8 @@ where
 
             // If the current shard has an execution shard, then we update the flag in case it was
             // not already set. That is:
-            // - If the current shard has an execution shard and the flag is set to zero, it will
-            //   be set to one.
+            // - If the current shard has an execution shard and the flag is set to zero, it will be
+            //   set to one.
             // - If the current shard has an execution shard and the flag is set to one, it will
             //   remain set to one.
             contains_execution_shard = builder.eval(
@@ -489,17 +449,11 @@ where
                 *bit = *next_bit;
             }
 
-            // Update the reconstruct challenger.
-            reconstruct_challenger_values = current_public_values.end_reconstruct_challenger;
-
-            // Update the cumulative sum.
-            for (sum_element, current_sum_element) in
-                global_cumulative_sum.iter_mut().zip_eq(current_public_values.cumulative_sum.iter())
-            {
-                *sum_element = builder.eval(*sum_element + *current_sum_element);
-            }
+            global_cumulative_sums.push(current_public_values.global_cumulative_sum);
         }
 
+        let global_cumulative_sum = builder.sum_digest_v2(global_cumulative_sums);
+
         // Update the global values from the last accumulated values.
         // Set sp1_vk digest to the one from the proof values.
         compress_public_values.sp1_vk_digest = sp1_vk_digest;
@@ -513,12 +467,6 @@ where
         compress_public_values.last_init_addr_bits = init_addr_bits;
         // Set the MemoryFinalize address bits to be the last MemoryFinalize address bits.
         compress_public_values.last_finalize_addr_bits = finalize_addr_bits;
-        // Set the leaf challenger to it's value.
-        compress_public_values.leaf_challenger = leaf_challenger_values;
-        // Set the start reconstruct challenger to be the initial reconstruct challenger.
-        compress_public_values.start_reconstruct_challenger = initial_reconstruct_challenger_values;
-        // Set the end reconstruct challenger to be the last reconstruct challenger.
-        compress_public_values.end_reconstruct_challenger = reconstruct_challenger_values;
         // Set the start reconstruct deferred digest to be the last reconstruct deferred digest.
         compress_public_values.end_reconstruct_deferred_digest = reconstruct_deferred_digest;
         // Assign the deferred proof digests.
@@ -526,7 +474,7 @@ where
         // Assign the committed value digests.
         compress_public_values.committed_value_digest = committed_value_digest;
         // Assign the cumulative sum.
-        compress_public_values.cumulative_sum = global_cumulative_sum;
+        compress_public_values.global_cumulative_sum = global_cumulative_sum;
         // Assign the `is_complete` flag.
         compress_public_values.is_complete = is_complete;
         // Set the contains an execution shard flag.
diff --git a/crates/recursion/circuit/src/machine/core.rs b/crates/recursion/circuit/src/machine/core.rs
index 3ed9376c45..c7ca95f93a 100644
--- a/crates/recursion/circuit/src/machine/core.rs
+++ b/crates/recursion/circuit/src/machine/core.rs
@@ -18,6 +18,8 @@ use sp1_core_machine::{
 };
 
 use sp1_recursion_core::air::PV_DIGEST_NUM_WORDS;
+use sp1_stark::air::InteractionScope;
+use sp1_stark::air::MachineAir;
 use sp1_stark::{
     air::{PublicValues, POSEIDON_NUM_WORDS},
     baby_bear_poseidon2::BabyBearPoseidon2,
@@ -28,7 +30,7 @@ use sp1_stark::{ShardProof, StarkGenericConfig, StarkVerifyingKey};
 
 use sp1_recursion_compiler::{
     circuit::CircuitV2Builder,
-    ir::{Builder, Config, Ext, ExtConst, Felt, SymbolicFelt},
+    ir::{Builder, Config, Felt, SymbolicFelt},
 };
 
 use sp1_recursion_core::{
@@ -37,9 +39,9 @@ use sp1_recursion_core::{
 };
 
 use crate::{
-    challenger::{CanObserveVariable, DuplexChallengerVariable, FieldChallengerVariable},
+    challenger::{CanObserveVariable, DuplexChallengerVariable},
     machine::recursion_public_values_digest,
-    stark::{dummy_challenger, dummy_vk_and_shard_proof, ShardProofVariable, StarkVerifier},
+    stark::{dummy_vk_and_shard_proof, ShardProofVariable, StarkVerifier},
     BabyBearFriConfig, BabyBearFriConfigVariable, CircuitConfig, VerifyingKeyVariable,
 };
 
@@ -49,8 +51,7 @@ pub struct SP1RecursionWitnessVariable<
 > {
     pub vk: VerifyingKeyVariable<C, SC>,
     pub shard_proofs: Vec<ShardProofVariable<C, SC>>,
-    pub leaf_challenger: SC::FriChallengerVariable,
-    pub initial_reconstruct_challenger: DuplexChallengerVariable<C>,
+    pub reconstruct_deferred_digest: [Felt<C::F>; DIGEST_SIZE],
     pub is_complete: Felt<C::F>,
     pub is_first_shard: Felt<C::F>,
     pub vk_root: [Felt<C::F>; DIGEST_SIZE],
@@ -62,11 +63,10 @@ pub struct SP1RecursionWitnessVariable<
 pub struct SP1RecursionWitnessValues<SC: StarkGenericConfig> {
     pub vk: StarkVerifyingKey<SC>,
     pub shard_proofs: Vec<ShardProof<SC>>,
-    pub leaf_challenger: SC::Challenger,
-    pub initial_reconstruct_challenger: SC::Challenger,
     pub is_complete: bool,
     pub is_first_shard: bool,
     pub vk_root: [SC::Val; DIGEST_SIZE],
+    pub reconstruct_deferred_digest: [SC::Val; 8],
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
@@ -126,11 +126,10 @@ where
         let SP1RecursionWitnessVariable {
             vk,
             shard_proofs,
-            leaf_challenger,
-            initial_reconstruct_challenger,
             is_complete,
             is_first_shard,
             vk_root,
+            reconstruct_deferred_digest,
         } = input;
 
         // Initialize shard variables.
@@ -166,13 +165,8 @@ where
         let mut deferred_proofs_digest: [Felt<_>; POSEIDON_NUM_WORDS] =
             array::from_fn(|_| builder.uninit());
 
-        // Initialize the challenger variables.
-        let leaf_challenger_public_values = leaf_challenger.public_values(builder);
-        let mut reconstruct_challenger: DuplexChallengerVariable<_> =
-            initial_reconstruct_challenger.copy(builder);
-
         // Initialize the cumulative sum.
-        let mut global_cumulative_sum: Ext<_, _> = builder.eval(C::EF::zero().cons());
+        let mut global_cumulative_sums = Vec::new();
 
         // Assert that the number of proofs is not zero.
         assert!(!shard_proofs.is_empty());
@@ -259,22 +253,14 @@ where
                     C::F::one(),
                 );
 
-                // If the initial shard is the first shard, we assert that the initial challenger
-                // is the same as a fresh challenger that absorbed the verifying key.
-                let mut first_shard_challenger = machine.config().challenger_variable(builder);
-                vk.observe_into(builder, &mut first_shard_challenger);
-                let first_challenger_public_values = first_shard_challenger.public_values(builder);
-                let initial_challenger_public_values =
-                    initial_reconstruct_challenger.public_values(builder);
-                for (first, initial) in
-                    first_challenger_public_values.into_iter().zip(initial_challenger_public_values)
-                {
-                    builder.assert_felt_eq(is_first_shard * (first - initial), C::F::zero());
-                }
-
                 // If it's the first shard (which is the first execution shard), then the `start_pc`
                 // should be vk.pc_start.
                 builder.assert_felt_eq(is_first_shard * (start_pc - vk.pc_start), C::F::zero());
+                // If it's the first shard, we add the vk's `initial_global_cumulative_sum` to the digest.
+                global_cumulative_sums.push(builder.select_global_cumulative_sum(
+                    is_first_shard,
+                    vk.initial_global_cumulative_sum,
+                ));
 
                 // Assert that `init_addr_bits` and `finalize_addr_bits` are zero for the first
                 for bit in current_init_addr_bits.iter() {
@@ -289,19 +275,26 @@ where
             //
             // Do not verify the cumulative sum here, since the permutation challenge is shared
             // between all shards.
-            let mut challenger = leaf_challenger.copy(builder);
 
-            let global_permutation_challenges =
-                (0..2).map(|_| challenger.sample_ext(builder)).collect::<Vec<_>>();
+            // Prepare a challenger.
+            let mut challenger = machine.config().challenger_variable(builder);
 
-            StarkVerifier::verify_shard(
+            // Observe the vk and start pc.
+            challenger.observe(builder, vk.commitment);
+            challenger.observe(builder, vk.pc_start);
+            challenger.observe_slice(builder, vk.initial_global_cumulative_sum.0.x.0);
+            challenger.observe_slice(builder, vk.initial_global_cumulative_sum.0.y.0);
+            // Observe the padding.
+            let zero: Felt<_> = builder.eval(C::F::zero());
+            challenger.observe(builder, zero);
+
+            challenger.observe_slice(
                 builder,
-                &vk,
-                machine,
-                &mut challenger,
-                &shard_proof,
-                &global_permutation_challenges,
+                shard_proof.public_values[0..machine.num_pv_elts()].iter().copied(),
             );
+            StarkVerifier::verify_shard(builder, &vk, machine, &mut challenger, &shard_proof);
+
+            let chips = machine.shard_chips_ordered(&shard_proof.chip_ordering).collect::<Vec<_>>();
 
             // Assert that first shard has a "CPU". Equivalently, assert that if the shard does
             // not have a "CPU", then the current shard is not 1.
@@ -426,8 +419,8 @@ where
 
             // Digest constraints.
             {
-                // // If `committed_value_digest` is not zero, then the current value should be equal
-                // to `public_values.committed_value_digest`.
+                // If `committed_value_digest` is not zero, then the current value should be
+                // equal to `public_values.committed_value_digest`.
 
                 // Set flags to indicate whether `committed_value_digest` is non-zero. The flags are
                 // given by the elements of the array, and they will be used as filters to constrain
@@ -522,19 +515,16 @@ where
             // have shard < 2^{MAX_LOG_NUMBER_OF_SHARDS}.
             C::range_check_felt(builder, public_values.shard, MAX_LOG_NUMBER_OF_SHARDS);
 
-            // Update the reconstruct challenger.
-            reconstruct_challenger.observe(builder, shard_proof.commitment.global_main_commit);
-            for element in shard_proof.public_values.iter().take(machine.num_pv_elts()) {
-                reconstruct_challenger.observe(builder, *element);
-            }
-
             // Cumulative sum is updated by sums of all chips.
-            for values in shard_proof.opened_values.chips.iter() {
-                global_cumulative_sum =
-                    builder.eval(global_cumulative_sum + values.global_cumulative_sum);
+            for (chip, values) in chips.iter().zip(shard_proof.opened_values.chips.iter()) {
+                if chip.commit_scope() == InteractionScope::Global {
+                    global_cumulative_sums.push(values.global_cumulative_sum);
+                }
             }
         }
 
+        let global_cumulative_sum = builder.sum_digest_v2(global_cumulative_sums);
+
         // Assert that the last exit code is zero.
         builder.assert_felt_eq(exit_code, C::F::zero());
 
@@ -543,20 +533,8 @@ where
             // Compute the vk digest.
             let vk_digest = vk.hash(builder);
 
-            // Collect the public values for challengers.
-            let initial_challenger_public_values =
-                initial_reconstruct_challenger.public_values(builder);
-            let final_challenger_public_values = reconstruct_challenger.public_values(builder);
-
-            // Collect the cumulative sum.
-            let global_cumulative_sum_array = builder.ext2felt_v2(global_cumulative_sum);
-
-            // Collect the deferred proof digests.
-            let zero: Felt<_> = builder.eval(C::F::zero());
-            let start_deferred_digest = [zero; POSEIDON_NUM_WORDS];
-            let end_deferred_digest = [zero; POSEIDON_NUM_WORDS];
-
             // Initialize the public values we will commit to.
+            let zero: Felt<_> = builder.eval(C::F::zero());
             let mut recursion_public_values_stream = [zero; RECURSIVE_PROOF_NUM_PV_ELTS];
             let recursion_public_values: &mut RecursionPublicValues<_> =
                 recursion_public_values_stream.as_mut_slice().borrow_mut();
@@ -574,12 +552,9 @@ where
                 initial_previous_finalize_addr_bits;
             recursion_public_values.last_finalize_addr_bits = current_finalize_addr_bits;
             recursion_public_values.sp1_vk_digest = vk_digest;
-            recursion_public_values.leaf_challenger = leaf_challenger_public_values;
-            recursion_public_values.start_reconstruct_challenger = initial_challenger_public_values;
-            recursion_public_values.end_reconstruct_challenger = final_challenger_public_values;
-            recursion_public_values.cumulative_sum = global_cumulative_sum_array;
-            recursion_public_values.start_reconstruct_deferred_digest = start_deferred_digest;
-            recursion_public_values.end_reconstruct_deferred_digest = end_deferred_digest;
+            recursion_public_values.global_cumulative_sum = global_cumulative_sum;
+            recursion_public_values.start_reconstruct_deferred_digest = reconstruct_deferred_digest;
+            recursion_public_values.end_reconstruct_deferred_digest = reconstruct_deferred_digest;
             recursion_public_values.exit_code = exit_code;
             recursion_public_values.is_complete = is_complete;
             // Set the contains an execution shard flag.
@@ -615,8 +590,7 @@ impl SP1RecursionWitnessValues<BabyBearPoseidon2> {
         Self {
             vk,
             shard_proofs,
-            leaf_challenger: dummy_challenger(machine.config()),
-            initial_reconstruct_challenger: dummy_challenger(machine.config()),
+            reconstruct_deferred_digest: [BabyBear::zero(); DIGEST_SIZE],
             is_complete: shape.is_complete,
             is_first_shard: false,
             vk_root: [BabyBear::zero(); DIGEST_SIZE],
diff --git a/crates/recursion/circuit/src/machine/deferred.rs b/crates/recursion/circuit/src/machine/deferred.rs
index d5ab720973..f4306bfa8a 100644
--- a/crates/recursion/circuit/src/machine/deferred.rs
+++ b/crates/recursion/circuit/src/machine/deferred.rs
@@ -10,10 +10,10 @@ use p3_baby_bear::BabyBear;
 use p3_commit::Mmcs;
 use p3_field::AbstractField;
 use p3_matrix::dense::RowMajorMatrix;
-
 use sp1_primitives::consts::WORD_SIZE;
-use sp1_recursion_compiler::ir::{Builder, Ext, Felt};
-
+use sp1_recursion_compiler::ir::{Builder, Felt};
+use sp1_stark::septic_curve::SepticCurve;
+use sp1_stark::septic_digest::SepticDigest;
 use sp1_stark::{
     air::{MachineAir, POSEIDON_NUM_WORDS},
     baby_bear_poseidon2::BabyBearPoseidon2,
@@ -30,7 +30,7 @@ use crate::{
     constraints::RecursiveVerifierConstraintFolder,
     hash::{FieldHasher, FieldHasherVariable},
     machine::assert_recursion_public_values_valid,
-    stark::{dummy_challenger, ShardProofVariable, StarkVerifier},
+    stark::{ShardProofVariable, StarkVerifier},
     BabyBearFriConfig, BabyBearFriConfigVariable, CircuitConfig, VerifyingKeyVariable,
 };
 
@@ -61,7 +61,6 @@ pub struct SP1DeferredWitnessValues<SC: BabyBearFriConfig + FieldHasher<BabyBear
     pub vk_merkle_data: SP1MerkleProofWitnessValues<SC>,
     pub start_reconstruct_deferred_digest: [SC::Val; POSEIDON_NUM_WORDS],
     pub sp1_vk_digest: [SC::Val; DIGEST_SIZE],
-    pub leaf_challenger: SC::Challenger,
     pub committed_value_digest: [Word<SC::Val>; PV_DIGEST_NUM_WORDS],
     pub deferred_proofs_digest: [SC::Val; POSEIDON_NUM_WORDS],
     pub end_pc: SC::Val,
@@ -80,7 +79,6 @@ pub struct SP1DeferredWitnessVariable<
     pub vk_merkle_data: SP1MerkleProofWitnessVariable<C, SC>,
     pub start_reconstruct_deferred_digest: [Felt<C::F>; POSEIDON_NUM_WORDS],
     pub sp1_vk_digest: [Felt<C::F>; DIGEST_SIZE],
-    pub leaf_challenger: SC::FriChallengerVariable,
     pub committed_value_digest: [Word<Felt<C::F>>; PV_DIGEST_NUM_WORDS],
     pub deferred_proofs_digest: [Felt<C::F>; POSEIDON_NUM_WORDS],
     pub end_pc: Felt<C::F>,
@@ -122,7 +120,6 @@ where
             vk_merkle_data,
             start_reconstruct_deferred_digest,
             sp1_vk_digest,
-            leaf_challenger,
             committed_value_digest,
             deferred_proofs_digest,
             end_pc,
@@ -157,10 +154,11 @@ where
             // Observe the vk and start pc.
             challenger.observe(builder, vk.commitment);
             challenger.observe(builder, vk.pc_start);
+            challenger.observe_slice(builder, vk.initial_global_cumulative_sum.0.x.0);
+            challenger.observe_slice(builder, vk.initial_global_cumulative_sum.0.y.0);
+            // Observe the padding.
             let zero: Felt<_> = builder.eval(C::F::zero());
-            for _ in 0..7 {
-                challenger.observe(builder, zero);
-            }
+            challenger.observe(builder, zero);
 
             // Observe the and public values.
             challenger.observe_slice(
@@ -168,15 +166,7 @@ where
                 shard_proof.public_values[0..machine.num_pv_elts()].iter().copied(),
             );
 
-            let zero_ext: Ext<C::F, C::EF> = builder.eval(C::F::zero());
-            StarkVerifier::verify_shard(
-                builder,
-                &vk,
-                machine,
-                &mut challenger,
-                &shard_proof,
-                &[zero_ext, zero_ext],
-            );
+            StarkVerifier::verify_shard(builder, &vk, machine, &mut challenger, &shard_proof);
 
             // Get the current public values.
             let current_public_values: &RecursionPublicValues<Felt<C::F>> =
@@ -232,11 +222,6 @@ where
         // Set the deferred proof digest to be the hitned value.
         deferred_public_values.deferred_proofs_digest = deferred_proofs_digest;
 
-        // Set the initial, end, and leaf challenger to be the hitned values.
-        let values = leaf_challenger.public_values(builder);
-        deferred_public_values.leaf_challenger = values;
-        deferred_public_values.start_reconstruct_challenger = values;
-        deferred_public_values.end_reconstruct_challenger = values;
         // Set the exit code to be zero for now.
         deferred_public_values.exit_code = builder.eval(C::F::zero());
         // Assign the deferred proof digests.
@@ -246,7 +231,10 @@ where
         // Set the `contains_execution_shard` flag.
         deferred_public_values.contains_execution_shard = builder.eval(C::F::zero());
         // Set the cumulative sum to zero.
-        deferred_public_values.cumulative_sum = array::from_fn(|_| builder.eval(C::F::zero()));
+        deferred_public_values.global_cumulative_sum =
+            SepticDigest(SepticCurve::convert(SepticDigest::<C::F>::zero().0, |value| {
+                builder.eval(value)
+            }));
         // Set the vk root from the witness.
         deferred_public_values.vk_root = vk_root;
         // Set the digest according to the previous values.
@@ -271,7 +259,6 @@ impl SP1DeferredWitnessValues<BabyBearPoseidon2> {
         Self {
             vks_and_proofs,
             vk_merkle_data,
-            leaf_challenger: dummy_challenger(machine.config()),
             is_complete: true,
             sp1_vk_digest: [BabyBear::zero(); DIGEST_SIZE],
             start_reconstruct_deferred_digest: [BabyBear::zero(); POSEIDON_NUM_WORDS],
diff --git a/crates/recursion/circuit/src/machine/witness.rs b/crates/recursion/circuit/src/machine/witness.rs
index 11b79831e1..debf82ea2d 100644
--- a/crates/recursion/circuit/src/machine/witness.rs
+++ b/crates/recursion/circuit/src/machine/witness.rs
@@ -89,14 +89,22 @@ where
     fn read(&self, builder: &mut Builder<C>) -> Self::WitnessVariable {
         let commitment = self.commit.read(builder);
         let pc_start = self.pc_start.read(builder);
+        let initial_global_cumulative_sum = self.initial_global_cumulative_sum.read(builder);
         let chip_information = self.chip_information.clone();
         let chip_ordering = self.chip_ordering.clone();
-        VerifyingKeyVariable { commitment, pc_start, chip_information, chip_ordering }
+        VerifyingKeyVariable {
+            commitment,
+            pc_start,
+            initial_global_cumulative_sum,
+            chip_information,
+            chip_ordering,
+        }
     }
 
     fn write(&self, witness: &mut impl WitnessWriter<C>) {
         self.commit.write(witness);
         self.pc_start.write(witness);
+        self.initial_global_cumulative_sum.write(witness);
     }
 }
 
@@ -109,18 +117,16 @@ where
     fn read(&self, builder: &mut Builder<C>) -> Self::WitnessVariable {
         let vk = self.vk.read(builder);
         let shard_proofs = self.shard_proofs.read(builder);
-        let leaf_challenger = self.leaf_challenger.read(builder);
-        let initial_reconstruct_challenger = self.initial_reconstruct_challenger.read(builder);
+        let reconstruct_deferred_digest = self.reconstruct_deferred_digest.read(builder);
         let is_complete = InnerVal::from_bool(self.is_complete).read(builder);
         let is_first_shard = InnerVal::from_bool(self.is_first_shard).read(builder);
         let vk_root = self.vk_root.read(builder);
         SP1RecursionWitnessVariable {
             vk,
             shard_proofs,
-            leaf_challenger,
-            initial_reconstruct_challenger,
             is_complete,
             is_first_shard,
+            reconstruct_deferred_digest,
             vk_root,
         }
     }
@@ -128,8 +134,7 @@ where
     fn write(&self, witness: &mut impl WitnessWriter<C>) {
         self.vk.write(witness);
         self.shard_proofs.write(witness);
-        self.leaf_challenger.write(witness);
-        self.initial_reconstruct_challenger.write(witness);
+        self.reconstruct_deferred_digest.write(witness);
         self.is_complete.write(witness);
         self.is_first_shard.write(witness);
         self.vk_root.write(witness);
@@ -169,7 +174,6 @@ where
         let start_reconstruct_deferred_digest =
             self.start_reconstruct_deferred_digest.read(builder);
         let sp1_vk_digest = self.sp1_vk_digest.read(builder);
-        let leaf_challenger = self.leaf_challenger.read(builder);
         let committed_value_digest = self.committed_value_digest.read(builder);
         let deferred_proofs_digest = self.deferred_proofs_digest.read(builder);
         let end_pc = self.end_pc.read(builder);
@@ -184,7 +188,6 @@ where
             vk_merkle_data,
             start_reconstruct_deferred_digest,
             sp1_vk_digest,
-            leaf_challenger,
             committed_value_digest,
             deferred_proofs_digest,
             end_pc,
@@ -201,7 +204,6 @@ where
         self.vk_merkle_data.write(witness);
         self.start_reconstruct_deferred_digest.write(witness);
         self.sp1_vk_digest.write(witness);
-        self.leaf_challenger.write(witness);
         self.committed_value_digest.write(witness);
         self.deferred_proofs_digest.write(witness);
         self.end_pc.write(witness);
diff --git a/crates/recursion/circuit/src/machine/wrap.rs b/crates/recursion/circuit/src/machine/wrap.rs
index 0ec0d8db40..e3708e7e7f 100644
--- a/crates/recursion/circuit/src/machine/wrap.rs
+++ b/crates/recursion/circuit/src/machine/wrap.rs
@@ -5,7 +5,7 @@ use p3_baby_bear::BabyBear;
 use p3_commit::Mmcs;
 use p3_field::AbstractField;
 use p3_matrix::dense::RowMajorMatrix;
-use sp1_recursion_compiler::ir::{Builder, Ext, Felt};
+use sp1_recursion_compiler::ir::{Builder, Felt};
 use sp1_stark::{air::MachineAir, StarkMachine};
 
 use crate::{
@@ -62,24 +62,17 @@ where
         // Observe the vk and start pc.
         challenger.observe(builder, vk.commitment);
         challenger.observe(builder, vk.pc_start);
+        challenger.observe_slice(builder, vk.initial_global_cumulative_sum.0.x.0);
+        challenger.observe_slice(builder, vk.initial_global_cumulative_sum.0.y.0);
+        // Observe the padding.
         let zero: Felt<_> = builder.eval(C::F::zero());
-        for _ in 0..7 {
-            challenger.observe(builder, zero);
-        }
+        challenger.observe(builder, zero);
 
         // Observe the main commitment and public values.
         challenger
             .observe_slice(builder, proof.public_values[0..machine.num_pv_elts()].iter().copied());
 
-        let zero_ext: Ext<C::F, C::EF> = builder.eval(C::F::zero());
-        StarkVerifier::verify_shard(
-            builder,
-            &vk,
-            machine,
-            &mut challenger,
-            &proof,
-            &[zero_ext, zero_ext],
-        );
+        StarkVerifier::verify_shard(builder, &vk, machine, &mut challenger, &proof);
 
         // Get the public values, and assert that they are valid.
         let public_values: &RootPublicValues<Felt<C::F>> = proof.public_values.as_slice().borrow();
diff --git a/crates/recursion/circuit/src/merkle_tree.rs b/crates/recursion/circuit/src/merkle_tree.rs
index a3524c0567..fe769b53d2 100644
--- a/crates/recursion/circuit/src/merkle_tree.rs
+++ b/crates/recursion/circuit/src/merkle_tree.rs
@@ -18,8 +18,8 @@ use crate::{
 #[serde(bound(serialize = "HV::Digest: Serialize"))]
 #[serde(bound(deserialize = "HV::Digest: Deserialize<'de>"))]
 pub struct MerkleTree<F: Field, HV: FieldHasher<F>> {
-    /// The height of the tree, not counting the root layer. This is the same as the logarithm of the
-    /// number of leaves.
+    /// The height of the tree, not counting the root layer. This is the same as the logarithm of
+    /// the number of leaves.
     pub height: usize,
 
     /// All the layers but the root. If there are `n` leaves where `n` is a power of 2, there are
diff --git a/crates/recursion/circuit/src/stark.rs b/crates/recursion/circuit/src/stark.rs
index 8242ab23ab..4fc4f15930 100644
--- a/crates/recursion/circuit/src/stark.rs
+++ b/crates/recursion/circuit/src/stark.rs
@@ -8,18 +8,18 @@ use p3_baby_bear::BabyBear;
 use p3_commit::{Mmcs, Pcs, PolynomialSpace, TwoAdicMultiplicativeCoset};
 use p3_field::{AbstractField, ExtensionField, Field, TwoAdicField};
 use p3_matrix::{dense::RowMajorMatrix, Dimensions};
-
 use sp1_recursion_compiler::{
     circuit::CircuitV2Builder,
-    ir::{Builder, Config, Ext, ExtConst},
+    ir::{Builder, Config, DslIr, Ext, ExtConst},
     prelude::Felt,
 };
 use sp1_stark::{
-    air::InteractionScope, baby_bear_poseidon2::BabyBearPoseidon2, AirOpenedValues, Challenger,
-    Chip, ChipOpenedValues, InnerChallenge, ProofShape, ShardCommitment, ShardOpenedValues,
-    ShardProof, Val, PROOF_MAX_NUM_PVS,
+    air::{InteractionScope, MachineAir},
+    baby_bear_poseidon2::BabyBearPoseidon2,
+    AirOpenedValues, Challenger, Chip, ChipOpenedValues, InnerChallenge, ProofShape,
+    ShardCommitment, ShardOpenedValues, ShardProof, StarkGenericConfig, StarkMachine,
+    StarkVerifyingKey, Val, PROOF_MAX_NUM_PVS,
 };
-use sp1_stark::{air::MachineAir, StarkGenericConfig, StarkMachine, StarkVerifyingKey};
 
 use crate::{
     challenger::CanObserveVariable,
@@ -33,12 +33,14 @@ use crate::{
     domain::PolynomialSpaceVariable, fri::verify_two_adic_pcs, BabyBearFriConfigVariable,
     TwoAdicPcsRoundVariable, VerifyingKeyVariable,
 };
+use sp1_stark::septic_digest::SepticDigest;
 
 /// Reference: [sp1_core::stark::ShardProof]
+#[allow(clippy::type_complexity)]
 #[derive(Clone)]
 pub struct ShardProofVariable<C: CircuitConfig<F = SC::Val>, SC: BabyBearFriConfigVariable<C>> {
     pub commitment: ShardCommitment<SC::DigestVariable>,
-    pub opened_values: ShardOpenedValues<Ext<C::F, C::EF>>,
+    pub opened_values: ShardOpenedValues<Felt<C::F>, Ext<C::F, C::EF>>,
     pub opening_proof: TwoAdicPcsProofVariable<C, SC>,
     pub chip_ordering: HashMap<String, usize>,
     pub public_values: Vec<Felt<C::F>>,
@@ -59,8 +61,7 @@ pub fn dummy_vk_and_shard_proof<A: MachineAir<BabyBear>>(
 ) -> (StarkVerifyingKey<BabyBearPoseidon2>, ShardProof<BabyBearPoseidon2>) {
     // Make a dummy commitment.
     let commitment = ShardCommitment {
-        global_main_commit: dummy_hash(),
-        local_main_commit: dummy_hash(),
+        main_commit: dummy_hash(),
         permutation_commit: dummy_hash(),
         quotient_commit: dummy_hash(),
     };
@@ -73,8 +74,6 @@ pub fn dummy_vk_and_shard_proof<A: MachineAir<BabyBear>>(
         .map(|(i, (name, _))| (name.clone(), i))
         .collect::<HashMap<_, _>>();
     let shard_chips = machine.shard_chips_ordered(&chip_ordering).collect::<Vec<_>>();
-    let chip_scopes = shard_chips.iter().map(|chip| chip.commit_scope()).collect::<Vec<_>>();
-    let has_global_main_commit = chip_scopes.contains(&InteractionScope::Global);
     let opened_values = ShardOpenedValues {
         chips: shard_chips
             .iter()
@@ -87,14 +86,11 @@ pub fn dummy_vk_and_shard_proof<A: MachineAir<BabyBear>>(
 
     let mut preprocessed_names_and_dimensions = vec![];
     let mut preprocessed_batch_shape = vec![];
-    let mut global_main_batch_shape = vec![];
-    let mut local_main_batch_shape = vec![];
+    let mut main_batch_shape = vec![];
     let mut permutation_batch_shape = vec![];
     let mut quotient_batch_shape = vec![];
 
-    for ((chip, chip_opening), scope) in
-        shard_chips.iter().zip_eq(opened_values.chips.iter()).zip_eq(chip_scopes.iter())
-    {
+    for (chip, chip_opening) in shard_chips.iter().zip_eq(opened_values.chips.iter()) {
         if !chip_opening.preprocessed.local.is_empty() {
             let prep_shape = PolynomialShape {
                 width: chip_opening.preprocessed.local.len(),
@@ -111,10 +107,7 @@ pub fn dummy_vk_and_shard_proof<A: MachineAir<BabyBear>>(
             width: chip_opening.main.local.len(),
             log_degree: chip_opening.log_degree,
         };
-        match scope {
-            InteractionScope::Global => global_main_batch_shape.push(main_shape),
-            InteractionScope::Local => local_main_batch_shape.push(main_shape),
-        }
+        main_batch_shape.push(main_shape);
         let permutation_shape = PolynomialShape {
             width: chip_opening.permutation.local.len(),
             log_degree: chip_opening.log_degree,
@@ -129,22 +122,12 @@ pub fn dummy_vk_and_shard_proof<A: MachineAir<BabyBear>>(
         }
     }
 
-    let batch_shapes = if has_global_main_commit {
-        vec![
-            PolynomialBatchShape { shapes: preprocessed_batch_shape },
-            PolynomialBatchShape { shapes: global_main_batch_shape },
-            PolynomialBatchShape { shapes: local_main_batch_shape },
-            PolynomialBatchShape { shapes: permutation_batch_shape },
-            PolynomialBatchShape { shapes: quotient_batch_shape },
-        ]
-    } else {
-        vec![
-            PolynomialBatchShape { shapes: preprocessed_batch_shape },
-            PolynomialBatchShape { shapes: local_main_batch_shape },
-            PolynomialBatchShape { shapes: permutation_batch_shape },
-            PolynomialBatchShape { shapes: quotient_batch_shape },
-        ]
-    };
+    let batch_shapes = vec![
+        PolynomialBatchShape { shapes: preprocessed_batch_shape },
+        PolynomialBatchShape { shapes: main_batch_shape },
+        PolynomialBatchShape { shapes: permutation_batch_shape },
+        PolynomialBatchShape { shapes: quotient_batch_shape },
+    ];
 
     let fri_queries = machine.config().fri_config().num_queries;
     let log_blowup = machine.config().fri_config().log_blowup;
@@ -175,6 +158,7 @@ pub fn dummy_vk_and_shard_proof<A: MachineAir<BabyBear>>(
     let vk = StarkVerifyingKey {
         commit: dummy_hash(),
         pc_start: BabyBear::zero(),
+        initial_global_cumulative_sum: SepticDigest::<BabyBear>::zero(),
         chip_information: preprocessed_chip_information,
         chip_ordering: preprocessed_chip_ordering,
     };
@@ -188,7 +172,7 @@ pub fn dummy_vk_and_shard_proof<A: MachineAir<BabyBear>>(
 fn dummy_opened_values<F: Field, EF: ExtensionField<F>, A: MachineAir<F>>(
     chip: &Chip<F, A>,
     log_degree: usize,
-) -> ChipOpenedValues<EF> {
+) -> ChipOpenedValues<F, EF> {
     let preprocessed_width = chip.preprocessed_width();
     let preprocessed = AirOpenedValues {
         local: vec![EF::zero(); preprocessed_width],
@@ -211,7 +195,7 @@ fn dummy_opened_values<F: Field, EF: ExtensionField<F>, A: MachineAir<F>>(
         main,
         permutation,
         quotient,
-        global_cumulative_sum: EF::zero(),
+        global_cumulative_sum: SepticDigest::<F>::zero(),
         local_cumulative_sum: EF::zero(),
         log_degree,
     }
@@ -265,14 +249,10 @@ where
         machine: &StarkMachine<SC, A>,
         challenger: &mut SC::FriChallengerVariable,
         proof: &ShardProofVariable<C, SC>,
-        global_permutation_challenges: &[Ext<C::F, C::EF>],
     ) where
         A: for<'a> Air<RecursiveVerifierConstraintFolder<'a, C>>,
     {
         let chips = machine.shard_chips_ordered(&proof.chip_ordering).collect::<Vec<_>>();
-        let chip_scopes = chips.iter().map(|chip| chip.commit_scope()).collect::<Vec<_>>();
-
-        let has_global_main_commit = chip_scopes.contains(&InteractionScope::Global);
 
         let ShardProofVariable {
             commitment,
@@ -309,33 +289,28 @@ where
             .map(|log_degree| Self::natural_domain_for_degree(machine.config(), 1 << log_degree))
             .collect::<Vec<_>>();
 
-        let ShardCommitment {
-            global_main_commit,
-            local_main_commit,
-            permutation_commit,
-            quotient_commit,
-        } = *commitment;
+        let ShardCommitment { main_commit, permutation_commit, quotient_commit } = *commitment;
 
-        challenger.observe(builder, local_main_commit);
+        challenger.observe(builder, main_commit);
 
         let local_permutation_challenges =
             (0..2).map(|_| challenger.sample_ext(builder)).collect::<Vec<_>>();
 
         challenger.observe(builder, permutation_commit);
         for (opening, chip) in opened_values.chips.iter().zip_eq(chips.iter()) {
-            let global_sum = C::ext2felt(builder, opening.global_cumulative_sum);
             let local_sum = C::ext2felt(builder, opening.local_cumulative_sum);
-            challenger.observe_slice(builder, global_sum);
+            let global_sum = opening.global_cumulative_sum;
+
             challenger.observe_slice(builder, local_sum);
+            challenger.observe_slice(builder, global_sum.0.x.0);
+            challenger.observe_slice(builder, global_sum.0.y.0);
 
-            let has_global_interactions = chip
-                .sends()
-                .iter()
-                .chain(chip.receives())
-                .any(|i| i.scope == InteractionScope::Global);
-            if !has_global_interactions {
-                builder.assert_ext_eq(opening.global_cumulative_sum, C::EF::zero().cons());
+            if chip.commit_scope() == InteractionScope::Local {
+                let is_real: Felt<C::F> = builder.uninit();
+                builder.push_op(DslIr::ImmF(is_real, C::F::one()));
+                builder.assert_digest_zero_v2(is_real, global_sum);
             }
+
             let has_local_interactions = chip
                 .sends()
                 .iter()
@@ -433,33 +408,15 @@ where
             })
             .collect::<Vec<_>>();
 
-        // Split the main_domains_points_and_opens to the global and local chips.
-        let mut global_trace_points_and_openings = Vec::new();
-        let mut local_trace_points_and_openings = Vec::new();
-        for (i, points_and_openings) in
-            main_domains_points_and_opens.clone().into_iter().enumerate()
-        {
-            let scope = chip_scopes[i];
-            if scope == InteractionScope::Global {
-                global_trace_points_and_openings.push(points_and_openings);
-            } else {
-                local_trace_points_and_openings.push(points_and_openings);
-            }
-        }
-
         // Create the pcs rounds.
         let prep_commit = vk.commitment;
         let prep_round = TwoAdicPcsRoundVariable {
             batch_commit: prep_commit,
             domains_points_and_opens: preprocessed_domains_points_and_opens,
         };
-        let global_main_round = TwoAdicPcsRoundVariable {
-            batch_commit: global_main_commit,
-            domains_points_and_opens: global_trace_points_and_openings,
-        };
-        let local_main_round = TwoAdicPcsRoundVariable {
-            batch_commit: local_main_commit,
-            domains_points_and_opens: local_trace_points_and_openings,
+        let main_round = TwoAdicPcsRoundVariable {
+            batch_commit: main_commit,
+            domains_points_and_opens: main_domains_points_and_opens,
         };
         let perm_round = TwoAdicPcsRoundVariable {
             batch_commit: permutation_commit,
@@ -470,11 +427,7 @@ where
             domains_points_and_opens: quotient_domains_points_and_opens,
         };
 
-        let rounds = if has_global_main_commit {
-            vec![prep_round, global_main_round, local_main_round, perm_round, quotient_round]
-        } else {
-            vec![prep_round, local_main_round, perm_round, quotient_round]
-        };
+        let rounds = vec![prep_round, main_round, perm_round, quotient_round];
 
         // Verify the pcs proof
         builder.cycle_tracker_v2_enter("stage-d-verify-pcs".to_string());
@@ -484,11 +437,7 @@ where
 
         // Verify the constrtaint evaluations.
         builder.cycle_tracker_v2_enter("stage-e-verify-constraints".to_string());
-        let permutation_challenges = global_permutation_challenges
-            .iter()
-            .chain(local_permutation_challenges.iter())
-            .copied()
-            .collect::<Vec<_>>();
+        let permutation_challenges = local_permutation_challenges;
 
         for (chip, trace_domain, qc_domains, values) in
             izip!(chips.iter(), trace_domains, quotient_chunk_domains, opened_values.chips.iter(),)
@@ -544,8 +493,7 @@ impl<C: CircuitConfig<F = SC::Val>, SC: BabyBearFriConfigVariable<C>> ShardProof
 #[allow(unused_imports)]
 #[cfg(any(test, feature = "export-tests"))]
 pub mod tests {
-    use std::collections::VecDeque;
-    use std::fmt::Debug;
+    use std::{collections::VecDeque, fmt::Debug};
 
     use crate::{
         challenger::{CanCopyChallenger, CanObserveVariable, DuplexChallengerVariable},
@@ -557,13 +505,14 @@ pub mod tests {
     use sp1_core_machine::{
         io::SP1Stdin,
         riscv::RiscvAir,
-        utils::{prove, setup_logger},
+        utils::{prove_core, prove_core_stream, setup_logger},
     };
     use sp1_recursion_compiler::{
         config::{InnerConfig, OuterConfig},
         ir::{Builder, DslIr, TracedVec},
     };
 
+    use sp1_core_executor::SP1Context;
     use sp1_recursion_core::{air::Block, machine::RecursionAir, stark::BabyBearPoseidon2Outer};
     use sp1_stark::{
         baby_bear_poseidon2::BabyBearPoseidon2, CpuProver, InnerVal, MachineProver, SP1CoreOpts,
@@ -590,20 +539,27 @@ pub mod tests {
     ) -> (TracedVec<DslIr<C>>, Vec<Block<BabyBear>>) {
         setup_logger();
 
+        let program = Program::from(elf).unwrap();
         let machine = RiscvAir::<C::F>::machine(SC::default());
-        let (_, vk) = machine.setup(&Program::from(elf).unwrap());
-        let (proof, _, _) = prove::<_, CoreP>(
-            Program::from(elf).unwrap(),
+        let prover = CoreP::new(machine);
+        let (pk, vk) = prover.setup(&program);
+
+        let (proof, _, _) = prove_core::<_, CoreP>(
+            &prover,
+            &pk,
+            &vk,
+            program,
             &SP1Stdin::new(),
-            SC::default(),
             opts,
+            SP1Context::default(),
             None,
         )
         .unwrap();
+
+        let machine = RiscvAir::<C::F>::machine(SC::default());
         let mut challenger = machine.config().challenger();
         machine.verify(&vk, &proof, &mut challenger).unwrap();
 
-        // Observe all the commitments.
         let mut builder = Builder::<C>::default();
 
         let mut witness_stream = Vec::<WitnessBlock<C>>::new();
@@ -625,29 +581,14 @@ pub mod tests {
                 dummy_proof.read(&mut builder)
             })
             .collect::<Vec<_>>();
-        // Observe all the commitments, and put the proofs into the witness stream.
-        for proof in proofs.iter() {
-            let ShardCommitment { global_main_commit, .. } = proof.commitment;
-            challenger.observe(&mut builder, global_main_commit);
-            let pv_slice = &proof.public_values[..machine.num_pv_elts()];
-            challenger.observe_slice(&mut builder, pv_slice.iter().cloned());
-        }
-
-        let global_permutation_challenges =
-            (0..2).map(|_| challenger.sample_ext(&mut builder)).collect::<Vec<_>>();
 
         // Verify the first proof.
         let num_shards = num_shards_in_batch.unwrap_or(proofs.len());
         for proof in proofs.into_iter().take(num_shards) {
             let mut challenger = challenger.copy(&mut builder);
-            StarkVerifier::verify_shard(
-                &mut builder,
-                &vk,
-                &machine,
-                &mut challenger,
-                &proof,
-                &global_permutation_challenges,
-            );
+            let pv_slice = &proof.public_values[..machine.num_pv_elts()];
+            challenger.observe_slice(&mut builder, pv_slice.iter().cloned());
+            StarkVerifier::verify_shard(&mut builder, &vk, &machine, &mut challenger, &proof);
         }
         (builder.into_operations(), witness_stream)
     }
diff --git a/crates/recursion/circuit/src/types.rs b/crates/recursion/circuit/src/types.rs
index 16eab3bc92..fdb9afa5be 100644
--- a/crates/recursion/circuit/src/types.rs
+++ b/crates/recursion/circuit/src/types.rs
@@ -2,6 +2,7 @@ use hashbrown::HashMap;
 use p3_commit::TwoAdicMultiplicativeCoset;
 use p3_field::{AbstractField, TwoAdicField};
 use p3_matrix::Dimensions;
+use sp1_stark::septic_digest::SepticDigest;
 
 use sp1_recursion_compiler::ir::{Builder, Ext, Felt};
 
@@ -17,6 +18,7 @@ use crate::{
 pub struct VerifyingKeyVariable<C: CircuitConfig<F = SC::Val>, SC: BabyBearFriConfigVariable<C>> {
     pub commitment: SC::DigestVariable,
     pub pc_start: Felt<C::F>,
+    pub initial_global_cumulative_sum: SepticDigest<Felt<C::F>>,
     pub chip_information: Vec<(String, TwoAdicMultiplicativeCoset<C::F>, Dimensions)>,
     pub chip_ordering: HashMap<String, usize>,
 }
@@ -83,11 +85,12 @@ impl<C: CircuitConfig<F = SC::Val>, SC: BabyBearFriConfigVariable<C>> VerifyingK
         challenger.observe(builder, self.commitment);
         // Observe the pc_start.
         challenger.observe(builder, self.pc_start);
+        // Observe the initial global cumulative sum.
+        challenger.observe_slice(builder, self.initial_global_cumulative_sum.0.x.0);
+        challenger.observe_slice(builder, self.initial_global_cumulative_sum.0.y.0);
         // Observe the padding.
         let zero: Felt<_> = builder.eval(C::F::zero());
-        for _ in 0..7 {
-            challenger.observe(builder, zero);
-        }
+        challenger.observe(builder, zero);
     }
 
     /// Hash the verifying key + prep domains into a single digest.
diff --git a/crates/recursion/circuit/src/utils.rs b/crates/recursion/circuit/src/utils.rs
index 9c7eeaf8f0..871c21ee09 100644
--- a/crates/recursion/circuit/src/utils.rs
+++ b/crates/recursion/circuit/src/utils.rs
@@ -1,20 +1,12 @@
-use std::mem::MaybeUninit;
-
 use p3_baby_bear::BabyBear;
 use p3_bn254_fr::Bn254Fr;
 use p3_field::{AbstractField, PrimeField32};
 
 use sp1_recursion_compiler::ir::{Builder, Config, Felt, Var};
-use sp1_recursion_core::{air::ChallengerPublicValues, DIGEST_SIZE};
+use sp1_recursion_core::DIGEST_SIZE;
 
 use sp1_stark::Word;
 
-pub(crate) unsafe fn uninit_challenger_pv<C: Config>(
-    _builder: &mut Builder<C>,
-) -> ChallengerPublicValues<Felt<C::F>> {
-    unsafe { MaybeUninit::zeroed().assume_init() }
-}
-
 /// Convert 8 BabyBear words into a Bn254Fr field element by shifting by 31 bits each time. The last
 /// word becomes the least significant bits.
 #[allow(dead_code)]
@@ -101,7 +93,10 @@ pub(crate) mod tests {
     use std::sync::Arc;
 
     use sp1_core_machine::utils::{run_test_machine_with_prover, setup_logger};
-    use sp1_recursion_compiler::{circuit::AsmCompiler, circuit::AsmConfig, ir::DslIr};
+    use sp1_recursion_compiler::{
+        circuit::{AsmCompiler, AsmConfig},
+        ir::DslIr,
+    };
 
     use sp1_recursion_compiler::ir::TracedVec;
     use sp1_recursion_core::{machine::RecursionAir, Runtime};
diff --git a/crates/recursion/circuit/src/witness/mod.rs b/crates/recursion/circuit/src/witness/mod.rs
index 05b23de603..cdf87e4467 100644
--- a/crates/recursion/circuit/src/witness/mod.rs
+++ b/crates/recursion/circuit/src/witness/mod.rs
@@ -5,6 +5,7 @@ use sp1_recursion_compiler::ir::{Builder, Ext, Felt};
 
 pub use outer::*;
 use sp1_stark::{
+    septic_curve::SepticCurve, septic_digest::SepticDigest, septic_extension::SepticExtension,
     ChipOpenedValues, Com, InnerChallenge, InnerVal, OpeningProof, ShardCommitment,
     ShardOpenedValues, ShardProof,
 };
@@ -165,30 +166,23 @@ impl<C: CircuitConfig, T: Witnessable<C>> Witnessable<C> for ShardCommitment<T>
     type WitnessVariable = ShardCommitment<T::WitnessVariable>;
 
     fn read(&self, builder: &mut Builder<C>) -> Self::WitnessVariable {
-        let global_main_commit = self.global_main_commit.read(builder);
-        let local_main_commit = self.local_main_commit.read(builder);
+        let main_commit = self.main_commit.read(builder);
         let permutation_commit = self.permutation_commit.read(builder);
         let quotient_commit = self.quotient_commit.read(builder);
-        Self::WitnessVariable {
-            global_main_commit,
-            local_main_commit,
-            permutation_commit,
-            quotient_commit,
-        }
+        Self::WitnessVariable { main_commit, permutation_commit, quotient_commit }
     }
 
     fn write(&self, witness: &mut impl WitnessWriter<C>) {
-        self.global_main_commit.write(witness);
-        self.local_main_commit.write(witness);
+        self.main_commit.write(witness);
         self.permutation_commit.write(witness);
         self.quotient_commit.write(witness);
     }
 }
 
 impl<C: CircuitConfig<F = InnerVal, EF = InnerChallenge>> Witnessable<C>
-    for ShardOpenedValues<InnerChallenge>
+    for ShardOpenedValues<InnerVal, InnerChallenge>
 {
-    type WitnessVariable = ShardOpenedValues<Ext<C::F, C::EF>>;
+    type WitnessVariable = ShardOpenedValues<Felt<C::F>, Ext<C::F, C::EF>>;
 
     fn read(&self, builder: &mut Builder<C>) -> Self::WitnessVariable {
         let chips = self.chips.read(builder);
@@ -201,9 +195,26 @@ impl<C: CircuitConfig<F = InnerVal, EF = InnerChallenge>> Witnessable<C>
 }
 
 impl<C: CircuitConfig<F = InnerVal, EF = InnerChallenge>> Witnessable<C>
-    for ChipOpenedValues<InnerChallenge>
+    for SepticDigest<InnerVal>
+{
+    type WitnessVariable = SepticDigest<Felt<C::F>>;
+
+    fn read(&self, builder: &mut Builder<C>) -> Self::WitnessVariable {
+        let x = self.0.x.0.read(builder);
+        let y = self.0.y.0.read(builder);
+        SepticDigest(SepticCurve { x: SepticExtension(x), y: SepticExtension(y) })
+    }
+
+    fn write(&self, witness: &mut impl WitnessWriter<C>) {
+        self.0.x.0.write(witness);
+        self.0.y.0.write(witness);
+    }
+}
+
+impl<C: CircuitConfig<F = InnerVal, EF = InnerChallenge>> Witnessable<C>
+    for ChipOpenedValues<InnerVal, InnerChallenge>
 {
-    type WitnessVariable = ChipOpenedValues<Ext<C::F, C::EF>>;
+    type WitnessVariable = ChipOpenedValues<Felt<C::F>, Ext<C::F, C::EF>>;
 
     fn read(&self, builder: &mut Builder<C>) -> Self::WitnessVariable {
         let preprocessed = self.preprocessed.read(builder);
diff --git a/crates/recursion/compiler/Cargo.toml b/crates/recursion/compiler/Cargo.toml
index 9be2287309..61ce7d9229 100644
--- a/crates/recursion/compiler/Cargo.toml
+++ b/crates/recursion/compiler/Cargo.toml
@@ -33,3 +33,7 @@ p3-dft = { workspace = true }
 p3-merkle-tree = { workspace = true }
 rand = "0.8.5"
 criterion = { version = "0.5.1", features = ["html_reports"] }
+
+[features]
+default = ["debug"]
+debug = []
\ No newline at end of file
diff --git a/crates/recursion/compiler/src/circuit/builder.rs b/crates/recursion/compiler/src/circuit/builder.rs
index f78145ad1d..a08567678a 100644
--- a/crates/recursion/compiler/src/circuit/builder.rs
+++ b/crates/recursion/compiler/src/circuit/builder.rs
@@ -2,12 +2,15 @@
 
 use std::iter::repeat;
 
+use crate::prelude::*;
+use itertools::Itertools;
 use p3_baby_bear::BabyBear;
 use p3_field::{AbstractExtensionField, AbstractField};
 use sp1_recursion_core::air::RecursionPublicValues;
-
-use crate::prelude::*;
 use sp1_recursion_core::{chips::poseidon2_skinny::WIDTH, D, DIGEST_SIZE, HASH_RATE};
+use sp1_stark::septic_curve::SepticCurve;
+use sp1_stark::septic_digest::SepticDigest;
+use sp1_stark::septic_extension::SepticExtension;
 
 pub trait CircuitV2Builder<C: Config> {
     fn bits2num_v2_f(
@@ -31,6 +34,19 @@ pub trait CircuitV2Builder<C: Config> {
     ) -> [Felt<C::F>; DIGEST_SIZE];
     fn fri_fold_v2(&mut self, input: CircuitV2FriFoldInput<C>) -> CircuitV2FriFoldOutput<C>;
     fn ext2felt_v2(&mut self, ext: Ext<C::F, C::EF>) -> [Felt<C::F>; D];
+    fn add_curve_v2(
+        &mut self,
+        point1: SepticCurve<Felt<C::F>>,
+        point2: SepticCurve<Felt<C::F>>,
+    ) -> SepticCurve<Felt<C::F>>;
+    fn assert_digest_zero_v2(&mut self, is_real: Felt<C::F>, digest: SepticDigest<Felt<C::F>>);
+    fn sum_digest_v2(&mut self, digests: Vec<SepticDigest<Felt<C::F>>>)
+        -> SepticDigest<Felt<C::F>>;
+    fn select_global_cumulative_sum(
+        &mut self,
+        is_first_shard: Felt<C::F>,
+        vk_digest: SepticDigest<Felt<C::F>>,
+    ) -> SepticDigest<Felt<C::F>>;
     fn commit_public_values_v2(&mut self, public_values: RecursionPublicValues<Felt<C::F>>);
     fn cycle_tracker_v2_enter(&mut self, name: String);
     fn cycle_tracker_v2_exit(&mut self);
@@ -188,6 +204,103 @@ impl<C: Config<F = BabyBear>> CircuitV2Builder<C> for Builder<C> {
         felts
     }
 
+    /// Adds two septic curve points: the sum is hinted, then constrained by the sum checkers.
+    fn add_curve_v2(
+        &mut self,
+        point1: SepticCurve<Felt<C::F>>,
+        point2: SepticCurve<Felt<C::F>>,
+    ) -> SepticCurve<Felt<C::F>> {
+        let point_sum_x: [Felt<C::F>; 7] = core::array::from_fn(|_| self.uninit());
+        let point_sum_y: [Felt<C::F>; 7] = core::array::from_fn(|_| self.uninit());
+        let point =
+            SepticCurve { x: SepticExtension(point_sum_x), y: SepticExtension(point_sum_y) };
+        self.push_op(DslIr::CircuitV2HintAddCurve(Box::new((point, point1, point2))));
+
+        let point1_symbolic = SepticCurve::convert(point1, |x| x.into());
+        let point2_symbolic = SepticCurve::convert(point2, |x| x.into());
+        let point_symbolic = SepticCurve::convert(point, |x| x.into());
+
+        let sum_checker_x = SepticCurve::<SymbolicFelt<C::F>>::sum_checker_x(
+            point1_symbolic,
+            point2_symbolic,
+            point_symbolic,
+        );
+
+        let sum_checker_y = SepticCurve::<SymbolicFelt<C::F>>::sum_checker_y(
+            point1_symbolic,
+            point2_symbolic,
+            point_symbolic,
+        );
+
+        for limb in sum_checker_x.0 {
+            self.assert_felt_eq(limb, C::F::zero());
+        }
+
+        for limb in sum_checker_y.0 {
+            self.assert_felt_eq(limb, C::F::zero());
+        }
+
+        point
+    }
+
+    /// Asserts `digest == SepticDigest::zero()` limb-wise, with both sides scaled by `is_real`.
+    fn assert_digest_zero_v2(&mut self, is_real: Felt<C::F>, digest: SepticDigest<Felt<C::F>>) {
+        let zero = SepticDigest::<SymbolicFelt<C::F>>::zero();
+        for (digest_limb_x, zero_limb_x) in digest.0.x.0.into_iter().zip_eq(zero.0.x.0.into_iter())
+        {
+            self.assert_felt_eq(is_real * digest_limb_x, is_real * zero_limb_x);
+        }
+        for (digest_limb_y, zero_limb_y) in digest.0.y.0.into_iter().zip_eq(zero.0.y.0.into_iter())
+        {
+            self.assert_felt_eq(is_real * digest_limb_y, is_real * zero_limb_y);
+        }
+    }
+
+    /// Returns `vk_digest` if `is_first_shard` is one, and the zero digest if it is zero.
+    fn select_global_cumulative_sum(
+        &mut self,
+        is_first_shard: Felt<C::F>,
+        vk_digest: SepticDigest<Felt<C::F>>,
+    ) -> SepticDigest<Felt<C::F>> {
+        let zero = SepticDigest::<SymbolicFelt<C::F>>::zero();
+        let one: Felt<C::F> = self.constant(C::F::one());
+        let x = SepticExtension(core::array::from_fn(|i| {
+            self.eval(is_first_shard * vk_digest.0.x.0[i] + (one - is_first_shard) * zero.0.x.0[i])
+        }));
+        let y = SepticExtension(core::array::from_fn(|i| {
+            self.eval(is_first_shard * vk_digest.0.y.0[i] + (one - is_first_shard) * zero.0.y.0[i])
+        }));
+        SepticDigest(SepticCurve { x, y })
+    }
+
+    /// Sums the digests into one, returning the zero digest when `digests` is empty.
+    fn sum_digest_v2(
+        &mut self,
+        digests: Vec<SepticDigest<Felt<C::F>>>,
+    ) -> SepticDigest<Felt<C::F>> {
+        let mut convert_to_felt =
+            |point: SepticCurve<C::F>| SepticCurve::convert(point, |value| self.eval(value));
+
+        let start = convert_to_felt(SepticDigest::starting_digest().0);
+        let zero_digest = convert_to_felt(SepticDigest::zero().0);
+
+        if digests.is_empty() {
+            return SepticDigest(zero_digest);
+        }
+
+        let neg_start = convert_to_felt(SepticDigest::starting_digest().0.neg());
+        let neg_zero_digest = convert_to_felt(SepticDigest::zero().0.neg());
+        // Accumulate from `start`; its negation is added once the loop has finished.
+        let mut ret = start;
+        for (i, digest) in digests.iter().enumerate() {
+            ret = self.add_curve_v2(ret, digest.0);
+            if i != digests.len() - 1 {
+                ret = self.add_curve_v2(ret, neg_zero_digest);
+            }
+        }
+        SepticDigest(self.add_curve_v2(ret, neg_start))
+    }
+
     // Commits public values.
     fn commit_public_values_v2(&mut self, public_values: RecursionPublicValues<Felt<C::F>>) {
         self.push_op(DslIr::CircuitV2CommitPublicValues(Box::new(public_values)));
diff --git a/crates/recursion/compiler/src/circuit/compiler.rs b/crates/recursion/compiler/src/circuit/compiler.rs
index 14aa320c98..b7a610c570 100644
--- a/crates/recursion/compiler/src/circuit/compiler.rs
+++ b/crates/recursion/compiler/src/circuit/compiler.rs
@@ -1,6 +1,8 @@
 use chips::poseidon2_skinny::WIDTH;
 use core::fmt::Debug;
-use instruction::{FieldEltType, HintBitsInstr, HintExt2FeltsInstr, HintInstr, PrintInstr};
+use instruction::{
+    FieldEltType, HintAddCurveInstr, HintBitsInstr, HintExt2FeltsInstr, HintInstr, PrintInstr,
+};
 use itertools::Itertools;
 use p3_field::{
     AbstractExtensionField, AbstractField, Field, PrimeField, PrimeField64, TwoAdicField,
@@ -10,6 +12,7 @@ use sp1_recursion_core::{
     air::{Block, RecursionPublicValues, RECURSIVE_PROOF_NUM_PV_ELTS},
     BaseAluInstr, BaseAluOpcode,
 };
+use sp1_stark::septic_curve::SepticCurve;
 use std::{borrow::Borrow, collections::HashMap, iter::repeat, mem::transmute};
 use vec_map::VecMap;
 
@@ -245,6 +248,7 @@ where
         f(self.ext_alu(DivE, out, Imm::EF(C::EF::one()), diff));
     }
 
+    #[inline(always)]
     fn poseidon2_permute(
         &mut self,
         dst: [impl Reg<C>; WIDTH],
@@ -259,6 +263,7 @@ where
         }))
     }
 
+    #[inline(always)]
     fn select(
         &mut self,
         bit: impl Reg<C>,
@@ -307,6 +312,32 @@ where
         })
     }
 
+    /// Builds the `HintAddCurve` instruction for `output = input1 + input2`.
+    fn add_curve(
+        &mut self,
+        output: SepticCurve<Felt<C::F>>,
+        input1: SepticCurve<Felt<C::F>>,
+        input2: SepticCurve<Felt<C::F>>,
+    ) -> Instruction<C::F> {
+        // Output multiplicities start at zero; the backfill step later visits them.
+        let output_x_addrs_mults =
+            output.x.0.into_iter().map(|r| (r.write(self), C::F::zero())).collect();
+        let output_y_addrs_mults =
+            output.y.0.into_iter().map(|r| (r.write(self), C::F::zero())).collect();
+        let input1_x_addrs = input1.x.0.into_iter().map(|v| v.read_ghost(self)).collect();
+        let input1_y_addrs = input1.y.0.into_iter().map(|v| v.read_ghost(self)).collect();
+        let input2_x_addrs = input2.x.0.into_iter().map(|v| v.read_ghost(self)).collect();
+        let input2_y_addrs = input2.y.0.into_iter().map(|v| v.read_ghost(self)).collect();
+        Instruction::HintAddCurve(Box::new(HintAddCurveInstr {
+            output_x_addrs_mults,
+            output_y_addrs_mults,
+            input1_x_addrs,
+            input1_y_addrs,
+            input2_x_addrs,
+            input2_y_addrs,
+        }))
+    }
+
     fn fri_fold(
         &mut self,
         CircuitV2FriFoldOutput { alpha_pow_output, ro_output }: CircuitV2FriFoldOutput<C>,
@@ -411,6 +442,7 @@ where
     ///
     /// We do not simply return a `Vec` for performance reasons --- results would be immediately fed
     /// to `flat_map`, so we employ fusion/deforestation to eliminate intermediate data structures.
+    #[inline]
     pub fn compile_one<F>(
         &mut self,
         ir_instr: DslIr<C>,
@@ -507,6 +539,7 @@ where
             DslIr::CircuitV2CommitPublicValues(public_values) => {
                 f(self.commit_public_values(&public_values))
             }
+            DslIr::CircuitV2HintAddCurve(data) => f(self.add_curve(data.0, data.1, data.2)),
 
             DslIr::PrintV(dst) => f(self.print_f(dst)),
             DslIr::PrintF(dst) => f(self.print_f(dst)),
@@ -545,6 +578,7 @@ where
                         Ok(instr) => {
                             span_builder.item(instr_name(&instr));
                             instrs.push(instr);
+                            #[cfg(feature = "debug")]
                             traces.push(trace.clone());
                         }
                         Err(CompileOneErr::CycleTrackerEnter(name)) => {
@@ -655,6 +689,17 @@ where
                             .iter_mut()
                             .for_each(|(addr, mult)| backfill((mult, addr)));
                     }
+                    Instruction::HintAddCurve(instr) => {
+                        let HintAddCurveInstr {
+                            output_x_addrs_mults, output_y_addrs_mults, ..
+                        } = instr.as_mut();
+                        output_x_addrs_mults
+                            .iter_mut()
+                            .for_each(|(addr, mult)| backfill((mult, addr)));
+                        output_y_addrs_mults
+                            .iter_mut()
+                            .for_each(|(addr, mult)| backfill((mult, addr)));
+                    }
                     // Instructions that do not write to memory.
                     Instruction::Mem(MemInstr { kind: MemAccessKind::Read, .. })
                     | Instruction::CommitPublicValues(_)
@@ -707,6 +752,7 @@ const fn instr_name<F>(instr: &Instruction<F>) -> &'static str {
         Instruction::Print(_) => "Print",
         Instruction::HintExt2Felts(_) => "HintExt2Felts",
         Instruction::Hint(_) => "Hint",
+        Instruction::HintAddCurve(_) => "HintAddCurve",
         Instruction::CommitPublicValues(_) => "CommitPublicValues",
     }
 }
diff --git a/crates/recursion/compiler/src/ir/builder.rs b/crates/recursion/compiler/src/ir/builder.rs
index 09ba4789bb..c11d25b142 100644
--- a/crates/recursion/compiler/src/ir/builder.rs
+++ b/crates/recursion/compiler/src/ir/builder.rs
@@ -2,7 +2,6 @@ use std::{cell::UnsafeCell, iter::Zip, ptr, vec::IntoIter};
 
 use backtrace::Backtrace;
 use p3_field::AbstractField;
-use sp1_core_machine::utils::sp1_debug_mode;
 use sp1_primitives::types::RecursionProgramType;
 
 use super::{
@@ -33,22 +32,27 @@ impl<T> From<Vec<T>> for TracedVec<T> {
 }
 
 impl<T> TracedVec<T> {
-    pub const fn new() -> Self {
-        Self { vec: Vec::new(), traces: Vec::new() }
+    pub fn new() -> Self {
+        Self { vec: Vec::with_capacity(10_000_000), traces: Vec::new() }
     }
 
+    #[inline(always)]
     pub fn push(&mut self, value: T) {
         self.vec.push(value);
-        self.traces.push(None);
+
+        #[cfg(feature = "debug")]
+        {
+            self.traces.push(None);
+        }
     }
 
-    /// Pushes a value to the vector and records a backtrace if SP1_DEBUG is enabled
+    /// Pushes a value to the vector and records a backtrace if the `debug` feature is enabled.
     pub fn trace_push(&mut self, value: T) {
         self.vec.push(value);
-        if sp1_debug_mode() {
+
+        #[cfg(feature = "debug")]
+        {
             self.traces.push(Some(Backtrace::new_unresolved()));
-        } else {
-            self.traces.push(None);
         }
     }
 
@@ -73,7 +77,12 @@ impl<T> IntoIterator for TracedVec<T> {
     type IntoIter = Zip<IntoIter<T>, IntoIter<Option<Backtrace>>>;
 
     fn into_iter(self) -> Self::IntoIter {
-        self.vec.into_iter().zip(self.traces)
+        // Pad missing traces (none are recorded without `debug`) so `zip` keeps every value.
+        let Self { vec, mut traces } = self;
+        if traces.len() < vec.len() {
+            traces.resize_with(vec.len(), || None);
+        }
+        vec.into_iter().zip(traces)
     }
 }
 
@@ -162,6 +171,7 @@ impl<C: Config> Builder<C> {
     }
 
     /// Pushes an operation to the builder.
+    #[inline(always)]
     pub fn push_op(&mut self, op: DslIr<C>) {
         self.inner.get_mut().operations.push(op);
     }
@@ -684,8 +694,8 @@ impl<'a, C: Config> IfBuilder<'a, C> {
         //         let lhs: Var<C::N> = self.builder.eval(lhs);
         //         IfCondition::NeI(lhs, rhs)
         //     }
-        //     (SymbolicVar::Val(lhs, _), SymbolicVar::Val(rhs, _), true) => IfCondition::Eq(lhs, rhs),
-        //     (SymbolicVar::Val(lhs, _), SymbolicVar::Val(rhs, _), false) => {
+        //     (SymbolicVar::Val(lhs, _), SymbolicVar::Val(rhs, _), true) => IfCondition::Eq(lhs,
+        // rhs),     (SymbolicVar::Val(lhs, _), SymbolicVar::Val(rhs, _), false) => {
         //         IfCondition::Ne(lhs, rhs)
         //     }
         //     (SymbolicVar::Val(lhs, _), rhs, true) => {
diff --git a/crates/recursion/compiler/src/ir/instructions.rs b/crates/recursion/compiler/src/ir/instructions.rs
index 04ddb26713..06f811ccf4 100644
--- a/crates/recursion/compiler/src/ir/instructions.rs
+++ b/crates/recursion/compiler/src/ir/instructions.rs
@@ -1,4 +1,7 @@
+#![deny(clippy::large_enum_variant)]
+
 use sp1_recursion_core::air::RecursionPublicValues;
+use sp1_stark::septic_curve::SepticCurve;
 
 use super::{
     Array, CircuitV2FriFoldInput, CircuitV2FriFoldOutput, Config, Ext, Felt, FriFoldInput,
@@ -120,7 +123,8 @@ pub enum DslIr<C: Config> {
     /// Inverts an extension field element (ext = 1 / ext).
     InvE(Ext<C::F, C::EF>, Ext<C::F, C::EF>),
 
-    /// Selects order of felts based on a bit (should_swap, first result, second result, first input, second input)
+    /// Selects order of felts based on a bit (should_swap, first result, second result, first
+    /// input, second input)
     Select(Felt<C::F>, Felt<C::F>, Felt<C::F>, Felt<C::F>, Felt<C::F>),
 
     // Control flow.
@@ -274,6 +278,11 @@ pub enum DslIr<C: Config> {
     /// Should only be used when target is a gnark circuit.
     CircuitCommitCommittedValuesDigest(Var<C::N>),
 
+    /// Adds two elliptic curve points. (sum, point_1, point_2).
+    CircuitV2HintAddCurve(
+        Box<(SepticCurve<Felt<C::F>>, SepticCurve<Felt<C::F>>, SepticCurve<Felt<C::F>>)>,
+    ),
+
     // FRI specific instructions.
     /// Executes a FRI fold operation. 1st field is the size of the fri fold input array.  2nd
     /// field is the fri fold input array.  See [`FriFoldInput`] for more details.
@@ -283,7 +292,8 @@ pub enum DslIr<C: Config> {
     /// more details.
     CircuitV2FriFold(Box<(CircuitV2FriFoldOutput<C>, CircuitV2FriFoldInput<C>)>),
     // FRI specific instructions.
-    /// Executes a Batch FRI loop. Input is the power of alphas, evaluations at z, and evaluations at x.
+    /// Executes a Batch FRI loop. Input is the power of alphas, evaluations at z, and evaluations
+    /// at x.
     CircuitV2BatchFRI(
         Box<(Ext<C::F, C::EF>, Vec<Ext<C::F, C::EF>>, Vec<Ext<C::F, C::EF>>, Vec<Felt<C::F>>)>,
     ),
diff --git a/crates/recursion/compiler/src/ir/symbolic.rs b/crates/recursion/compiler/src/ir/symbolic.rs
index 40ed4c4bf6..5c114923f2 100644
--- a/crates/recursion/compiler/src/ir/symbolic.rs
+++ b/crates/recursion/compiler/src/ir/symbolic.rs
@@ -1262,7 +1262,7 @@ impl<F: Field, EF: ExtensionField<F>, E: Any> ExtensionOperand<F, EF> for E {
                 let value_ref = unsafe { mem::transmute::<&E, &ExtOperand<F, EF>>(&self) };
                 value_ref.clone()
             }
-            _ => unimplemented!("unsupported type"),
+            _ => unimplemented!("unsupported type {:?}", self.type_id()),
         }
     }
 }
diff --git a/crates/recursion/core/Cargo.toml b/crates/recursion/core/Cargo.toml
index 6000e84c54..7d3c6682e0 100644
--- a/crates/recursion/core/Cargo.toml
+++ b/crates/recursion/core/Cargo.toml
@@ -8,6 +8,7 @@ license = { workspace = true }
 repository = { workspace = true }
 keywords = { workspace = true }
 categories = { workspace = true }
+links = "sp1-recursion-core-sys"
 
 [dependencies]
 p3-field = { workspace = true }
@@ -38,6 +39,19 @@ backtrace = { version = "0.3.71", features = ["serde"] }
 static_assertions = "1.1.0"
 thiserror = "1.0.60"
 vec_map = "0.8.2"
+num_cpus = "1.16.0"
 
 [dev-dependencies]
 rand = "0.8.5"
+
+[build-dependencies]
+sp1-stark = { workspace = true }
+sp1-primitives = { workspace = true }
+p3-baby-bear = { workspace = true }
+cbindgen = "0.27.0"
+cc = "1.1"
+pathdiff = "0.2.1"
+glob = "0.3.1"
+
+[features]
+sys = ["sp1-core-machine/sys"]
\ No newline at end of file
diff --git a/crates/recursion/core/build.rs b/crates/recursion/core/build.rs
new file mode 100644
index 0000000000..b188556047
--- /dev/null
+++ b/crates/recursion/core/build.rs
@@ -0,0 +1,200 @@
+fn main() {
+    #[cfg(feature = "sys")]
+    sys::build_ffi(); // Only compiled in with the `sys` feature; otherwise `main` is a no-op.
+}
+
+#[cfg(feature = "sys")]
+mod sys {
+    use std::{
+        env, fs, os,
+        path::{Path, PathBuf},
+    };
+
+    use pathdiff::diff_paths;
+
+    /// The library name, used for the static library archive and the headers.
+    /// Should be chosen as to not conflict with other library/header names.
+    const LIB_NAME: &str = "sp1-recursion-core-sys";
+
+    /// The name of all include directories involved, used to find and output header files.
+    const INCLUDE_DIRNAME: &str = "include";
+
+    /// The name of the directory to recursively search for source files in.
+    const SOURCE_DIRNAME: &str = "cpp";
+
+    /// The warning placed in the cbindgen header.
+    const AUTOGEN_WARNING: &str =
+        "/* Automatically generated by `cbindgen`. Not intended for manual editing. */";
+
+    /// Generates the cbindgen header, stages the C++ headers, and compiles the C++ sources.
+    pub fn build_ffi() {
+        // The name of the header generated by `cbindgen`.
+        let cbindgen_hpp = &format!("{LIB_NAME}-cbindgen.hpp");
+
+        // The crate directory.
+        let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
+
+        // The output directory, where built artifacts should be placed.
+        let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
+
+        // The target directory that the cargo invocation is using.
+        // Headers are symlinked into `target/include` purely for IDE purposes.
+        let target_dir = {
+            let mut dir = out_dir.clone();
+            loop {
+                if dir.ends_with("target") {
+                    break dir;
+                }
+                if !dir.pop() {
+                    panic!("OUT_DIR does not have parent called \"target\": {:?}", out_dir);
+                }
+            }
+        };
+
+        // The directory to read headers from.
+        let source_include_dir = crate_dir.join(INCLUDE_DIRNAME);
+
+        // The directory to place headers into.
+        let target_include_dir = out_dir.join(INCLUDE_DIRNAME);
+
+        // The directory to place symlinks to headers into. Has the fixed path "target/include".
+        let target_include_dir_fixed = target_dir.join(INCLUDE_DIRNAME);
+
+        // The directory to read source files from.
+        let source_dir = crate_dir.join(SOURCE_DIRNAME);
+
+        let headers = glob::glob(source_include_dir.join("**/*.hpp").to_str().unwrap())
+            .unwrap()
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+
+        let compilation_units = glob::glob(source_dir.join("**/*.cpp").to_str().unwrap())
+            .unwrap()
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+
+        // Tell Cargo that if the given file changes, to rerun this build script.
+        println!("cargo::rerun-if-changed={INCLUDE_DIRNAME}");
+        println!("cargo::rerun-if-changed={SOURCE_DIRNAME}");
+        println!("cargo::rerun-if-changed=src");
+        println!("cargo::rerun-if-changed=Cargo.toml");
+
+        // Cargo build script metadata, used by dependents' build scripts.
+        // The root directory containing the library archive.
+        println!("cargo::metadata=root={}", out_dir.to_str().unwrap());
+
+        // The include path defining the library's API.
+        println!("cargo::metadata=include={}", target_include_dir.to_str().unwrap());
+
+        // Generate a header containing bindings to the crate.
+        match cbindgen::Builder::new()
+            .with_pragma_once(true)
+            .with_autogen_warning(AUTOGEN_WARNING)
+            .with_no_includes()
+            .with_sys_include("cstdint")
+            .with_sys_include("cstddef")
+            .with_parse_deps(true)
+            .with_parse_include(&[
+                "sp1-stark",
+                "sp1-primitives",
+                "sp1-core-machine",
+                "p3-baby-bear",
+                "sp1-core-executor",
+            ])
+            .with_parse_extra_bindings(&["sp1-stark", "sp1-primitives", "p3-baby-bear"])
+            .rename_item("BabyBear", "BabyBearP3")
+            .include_item("BaseAluEvent")
+            .include_item("BaseAluValueCols")
+            .include_item("BaseAluAccessCols")
+            .include_item("BaseAluInstr")
+            .include_item("ExtAluEvent")
+            .include_item("ExtAluValueCols")
+            .include_item("ExtAluInstr")
+            .include_item("ExtAluAccessCols")
+            .include_item("BatchFRIEvent")
+            .include_item("BatchFRICols")
+            .include_item("BatchFRIInstrFFI")
+            .include_item("BatchFRIPreprocessedCols")
+            .include_item("ExpReverseBitsEventFFI")
+            .include_item("ExpReverseBitsLenCols")
+            .include_item("ExpReverseBitsInstrFFI")
+            .include_item("ExpReverseBitsLenPreprocessedCols")
+            .include_item("FriFoldEvent")
+            .include_item("FriFoldCols")
+            .include_item("FriFoldInstrFFI")
+            .include_item("FriFoldPreprocessedCols")
+            .include_item("CommitPublicValuesEvent")
+            .include_item("PublicValuesCols")
+            .include_item("CommitPublicValuesInstr")
+            .include_item("PublicValuesPreprocessedCols")
+            .include_item("SelectEvent")
+            .include_item("SelectCols")
+            .include_item("SelectInstr")
+            .include_item("SelectPreprocessedCols")
+            .include_item("Poseidon2Event")
+            .include_item("Poseidon2")
+            .include_item("Poseidon2Instr")
+            .include_item("Poseidon2PreprocessedColsSkinny")
+            .include_item("Poseidon2PreprocessedColsWide")
+            .include_item("OUTPUT_ROUND_IDX")
+            .with_namespace("sp1_recursion_core_sys")
+            .with_crate(crate_dir)
+            .generate()
+        {
+            Ok(bindings) => {
+                // Write the bindings to the target include directory.
+                let header_path = target_include_dir.join(cbindgen_hpp);
+                if bindings.write_to_file(&header_path) {
+                    // Symlink the header to the fixed include directory.
+                    rel_symlink_file(header_path, target_include_dir_fixed.join(cbindgen_hpp));
+                }
+            }
+            Err(cbindgen::Error::ParseSyntaxError { .. }) => {} // Ignore parse errors so rust-analyzer can run.
+            Err(e) => panic!("{:?}", e),
+        }
+
+        // Copy the headers to the include directory and symlink them to the fixed include directory.
+        for header in &headers {
+            // Get the path of the header relative to the source include directory.
+            let relpath = diff_paths(header, &source_include_dir).unwrap();
+
+            // Let the destination path be the same place relative to the target include directory.
+            let dst = target_include_dir.join(&relpath);
+
+            // Create the parent directory if it does not exist.
+            if let Some(parent) = dst.parent() {
+                fs::create_dir_all(parent).unwrap();
+            }
+            fs::copy(header, &dst).unwrap();
+            rel_symlink_file(dst, target_include_dir_fixed.join(relpath));
+        }
+
+        println!("cargo::rustc-link-lib=static=sp1-core-machine-sys");
+        let include_dir = env::var("DEP_SP1_CORE_MACHINE_SYS_INCLUDE")
+            .expect("DEP_SP1_CORE_MACHINE_SYS_INCLUDE not set by sp1-core-machine's build script");
+
+        // Use the `cc` crate to build the library and statically link it to the crate.
+        let mut cc_builder = cc::Build::new();
+        cc_builder.files(&compilation_units).include(target_include_dir).include(include_dir);
+        cc_builder.cpp(true).std("c++20");
+        cc_builder.compile(LIB_NAME)
+    }
+
+    /// Creates (or replaces) a symlink at `link` that points to `original` via a relative path.
+    fn rel_symlink_file<P, Q>(original: P, link: Q)
+    where
+        P: AsRef<Path>,
+        Q: AsRef<Path>,
+    {
+        #[cfg(unix)]
+        use os::unix::fs::symlink;
+        #[cfg(windows)]
+        use os::windows::fs::symlink_file as symlink;
+        let link = link.as_ref();
+        let link_dir = link.parent().unwrap();
+        fs::create_dir_all(link_dir).unwrap();
+        let _ = fs::remove_file(link); // Errors (e.g. nothing to remove) are ignored.
+        let relative_target = diff_paths(original, link_dir).unwrap();
+        symlink(relative_target, link).unwrap();
+    }
+}
diff --git a/crates/recursion/core/cpp/extern.cpp b/crates/recursion/core/cpp/extern.cpp
new file mode 100644
index 0000000000..12354c4dd7
--- /dev/null
+++ b/crates/recursion/core/cpp/extern.cpp
@@ -0,0 +1,135 @@
+#include "bb31_t.hpp"
+#include "sys.hpp"
+
+namespace sp1_recursion_core_sys {  // BabyBear (bb31_t) FFI shims
+extern void alu_base_event_to_row_babybear(const BaseAluIo<BabyBearP3>* io,
+                                           BaseAluValueCols<BabyBearP3>* cols) {
+  alu_base::event_to_row<bb31_t>(
+      *reinterpret_cast<const BaseAluIo<bb31_t>*>(io),
+      *reinterpret_cast<BaseAluValueCols<bb31_t>*>(cols));
+}
+extern void alu_base_instr_to_row_babybear(
+    const BaseAluInstr<BabyBearP3>* instr,
+    BaseAluAccessCols<BabyBearP3>* access) {
+  alu_base::instr_to_row<bb31_t>(
+      *reinterpret_cast<const BaseAluInstr<bb31_t>*>(instr),
+      *reinterpret_cast<BaseAluAccessCols<bb31_t>*>(access));
+}
+
+extern void alu_ext_event_to_row_babybear(const ExtAluIo<Block<BabyBearP3>>* io,
+                                          ExtAluValueCols<BabyBearP3>* cols) {
+  alu_ext::event_to_row<bb31_t>(
+      *reinterpret_cast<const ExtAluIo<Block<bb31_t>>*>(io),
+      *reinterpret_cast<ExtAluValueCols<bb31_t>*>(cols));
+}
+extern void alu_ext_instr_to_row_babybear(
+    const ExtAluInstr<BabyBearP3>* instr,
+    ExtAluAccessCols<BabyBearP3>* access) {
+  alu_ext::instr_to_row<bb31_t>(
+      *reinterpret_cast<const ExtAluInstr<bb31_t>*>(instr),
+      *reinterpret_cast<ExtAluAccessCols<bb31_t>*>(access));
+}
+
+extern void batch_fri_event_to_row_babybear(const BatchFRIEvent<BabyBearP3>* io,
+                                            BatchFRICols<BabyBearP3>* cols) {
+  batch_fri::event_to_row<bb31_t>(
+      *reinterpret_cast<const BatchFRIEvent<bb31_t>*>(io),
+      *reinterpret_cast<BatchFRICols<bb31_t>*>(cols));
+}
+extern void batch_fri_instr_to_row_babybear(
+    const BatchFRIInstrFFI<BabyBearP3>* instr,
+    BatchFRIPreprocessedCols<BabyBearP3>* cols) {
+  batch_fri::instr_to_row<bb31_t>(
+      *reinterpret_cast<const BatchFRIInstrFFI<bb31_t>*>(instr),
+      *reinterpret_cast<BatchFRIPreprocessedCols<bb31_t>*>(cols));
+}
+
+extern void exp_reverse_bits_event_to_row_babybear(
+    const ExpReverseBitsEventFFI<BabyBearP3>* io, size_t i,
+    ExpReverseBitsLenCols<BabyBearP3>* cols) {
+  exp_reverse_bits::event_to_row<bb31_t>(
+      *reinterpret_cast<const ExpReverseBitsEventFFI<bb31_t>*>(io), i,
+      *reinterpret_cast<ExpReverseBitsLenCols<bb31_t>*>(cols));
+}
+extern void exp_reverse_bits_instr_to_row_babybear(
+    const ExpReverseBitsInstrFFI<BabyBearP3>* instr, size_t i, size_t len,
+    ExpReverseBitsLenPreprocessedCols<BabyBearP3>* cols) {
+  exp_reverse_bits::instr_to_row<bb31_t>(
+      *reinterpret_cast<const ExpReverseBitsInstrFFI<bb31_t>*>(instr), i, len,
+      *reinterpret_cast<ExpReverseBitsLenPreprocessedCols<bb31_t>*>(cols));
+}
+
+extern void fri_fold_event_to_row_babybear(const FriFoldEvent<BabyBearP3>* io,
+                                           FriFoldCols<BabyBearP3>* cols) {
+  fri_fold::event_to_row<bb31_t>(
+      *reinterpret_cast<const FriFoldEvent<bb31_t>*>(io),
+      *reinterpret_cast<FriFoldCols<bb31_t>*>(cols));
+}
+extern void fri_fold_instr_to_row_babybear(
+    const FriFoldInstrFFI<BabyBearP3>* instr, size_t i,
+    FriFoldPreprocessedCols<BabyBearP3>* cols) {
+  fri_fold::instr_to_row<bb31_t>(
+      *reinterpret_cast<const FriFoldInstrFFI<bb31_t>*>(instr), i,
+      *reinterpret_cast<FriFoldPreprocessedCols<bb31_t>*>(cols));
+}
+
+extern void public_values_event_to_row_babybear(
+    const CommitPublicValuesEvent<BabyBearP3>* io, size_t digest_idx,
+    PublicValuesCols<BabyBearP3>* cols) {
+  public_values::event_to_row<bb31_t>(
+      *reinterpret_cast<const CommitPublicValuesEvent<bb31_t>*>(io), digest_idx,
+      *reinterpret_cast<PublicValuesCols<bb31_t>*>(cols));
+}
+extern void public_values_instr_to_row_babybear(
+    const CommitPublicValuesInstr<BabyBearP3>* instr, size_t digest_idx,
+    PublicValuesPreprocessedCols<BabyBearP3>* cols) {
+  public_values::instr_to_row<bb31_t>(
+      *reinterpret_cast<const CommitPublicValuesInstr<bb31_t>*>(instr),
+      digest_idx,
+      *reinterpret_cast<PublicValuesPreprocessedCols<bb31_t>*>(cols));
+}
+
+extern void select_event_to_row_babybear(const SelectEvent<BabyBearP3>* io,
+                                         SelectCols<BabyBearP3>* cols) {
+  select::event_to_row<bb31_t>(
+      *reinterpret_cast<const SelectEvent<bb31_t>*>(io),
+      *reinterpret_cast<SelectCols<bb31_t>*>(cols));
+}
+extern void select_instr_to_row_babybear(
+    const SelectInstr<BabyBearP3>* instr,
+    SelectPreprocessedCols<BabyBearP3>* cols) {
+  select::instr_to_row<bb31_t>(
+      *reinterpret_cast<const SelectInstr<bb31_t>*>(instr),
+      *reinterpret_cast<SelectPreprocessedCols<bb31_t>*>(cols));
+}
+
+extern void poseidon2_skinny_event_to_row_babybear(
+    const Poseidon2Event<BabyBearP3>* event, Poseidon2<BabyBearP3>* cols) {
+  poseidon2_skinny::event_to_row<bb31_t>(
+      *reinterpret_cast<const Poseidon2Event<bb31_t>*>(event),
+      reinterpret_cast<Poseidon2<bb31_t>*>(cols));
+}
+extern void poseidon2_skinny_instr_to_row_babybear(
+    const Poseidon2Instr<BabyBearP3>* instr, size_t i,
+    Poseidon2PreprocessedColsSkinny<BabyBearP3>* cols) {
+  poseidon2_skinny::instr_to_row<bb31_t>(
+      *reinterpret_cast<const Poseidon2Instr<bb31_t>*>(instr), i,
+      *reinterpret_cast<Poseidon2PreprocessedColsSkinny<bb31_t>*>(cols));
+}
+// NOTE(review): only this shim spells out "C" linkage; confirm for the rest.
+extern "C" void poseidon2_wide_event_to_row_babybear(const BabyBearP3* input,
+                                                     BabyBearP3* input_row,
+                                                     bool sbox_state) {
+  poseidon2_wide::event_to_row<bb31_t>(reinterpret_cast<const bb31_t*>(input),
+                                       reinterpret_cast<bb31_t*>(input_row), 0,
+                                       1, sbox_state);
+}
+
+extern void poseidon2_wide_instr_to_row_babybear(
+    const Poseidon2SkinnyInstr<BabyBearP3>* instr,
+    Poseidon2PreprocessedColsWide<BabyBearP3>* cols) {
+  poseidon2_wide::instr_to_row<bb31_t>(
+      *reinterpret_cast<const Poseidon2SkinnyInstr<bb31_t>*>(instr),
+      *reinterpret_cast<Poseidon2PreprocessedColsWide<bb31_t>*>(cols));
+}
+}  // namespace sp1_recursion_core_sys
diff --git a/crates/recursion/core/include/alu_base.hpp b/crates/recursion/core/include/alu_base.hpp
new file mode 100644
index 0000000000..4a85532e00
--- /dev/null
+++ b/crates/recursion/core/include/alu_base.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::alu_base {
+// Row-filling helpers for the base-field ALU, generic over the field type
+// `F`.
+
+// Stores a base-field ALU event in the value columns of its row. The event
+// is copied into `cols.vals` unchanged.
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const BaseAluEvent<F>& event,
+                                  BaseAluValueCols<F>& cols) {
+  cols.vals = event;
+}
+
+// Stores a base-field ALU instruction in the access columns of its row.
+//
+// `addrs` and `mult` are copied from the instruction, and the opcode is
+// encoded in the four selector flags `is_add`, `is_sub`, `is_mul` and
+// `is_div`.
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const BaseAluInstr<F>& instr,
+                                  BaseAluAccessCols<F>& access) {
+  const auto opcode = instr.opcode;
+
+  access.addrs = instr.addrs;
+  access.mult = instr.mult;
+
+  // The flag that matches `opcode` is set to 1 and the others to 0; an
+  // opcode outside these four leaves every flag at 0.
+  access.is_add = F(opcode == BaseAluOpcode::AddF ? 1 : 0);
+  access.is_sub = F(opcode == BaseAluOpcode::SubF ? 1 : 0);
+  access.is_mul = F(opcode == BaseAluOpcode::MulF ? 1 : 0);
+  access.is_div = F(opcode == BaseAluOpcode::DivF ? 1 : 0);
+}
+}  // namespace sp1_recursion_core_sys::alu_base
diff --git a/crates/recursion/core/include/alu_ext.hpp b/crates/recursion/core/include/alu_ext.hpp
new file mode 100644
index 0000000000..0448e6830c
--- /dev/null
+++ b/crates/recursion/core/include/alu_ext.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::alu_ext {
+// Row-filling helpers for the extension-field ALU, generic over the field
+// type `F`.
+
+// Stores an extension-field ALU event in the value columns of its row. The
+// event is copied into `cols.vals` unchanged.
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const ExtAluEvent<F>& event,
+                                  ExtAluValueCols<F>& cols) {
+  cols.vals = event;
+}
+
+// Stores an extension-field ALU instruction in the access columns of its row.
+//
+// `addrs` and `mult` are copied from the instruction, and the opcode is
+// encoded in the four selector flags `is_add`, `is_sub`, `is_mul` and
+// `is_div`.
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const ExtAluInstr<F>& instr,
+                                  ExtAluAccessCols<F>& access) {
+  const auto opcode = instr.opcode;
+
+  access.addrs = instr.addrs;
+  access.mult = instr.mult;
+
+  // The flag that matches `opcode` is set to 1 and the others to 0; an
+  // opcode outside these four leaves every flag at 0.
+  access.is_add = F(opcode == ExtAluOpcode::AddE ? 1 : 0);
+  access.is_sub = F(opcode == ExtAluOpcode::SubE ? 1 : 0);
+  access.is_mul = F(opcode == ExtAluOpcode::MulE ? 1 : 0);
+  access.is_div = F(opcode == ExtAluOpcode::DivE ? 1 : 0);
+}
+}  // namespace sp1_recursion_core_sys::alu_ext
diff --git a/crates/recursion/core/include/batch_fri.hpp b/crates/recursion/core/include/batch_fri.hpp
new file mode 100644
index 0000000000..b9ab20ea23
--- /dev/null
+++ b/crates/recursion/core/include/batch_fri.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::batch_fri {
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const BatchFRIEvent<F>& event,
+                                  BatchFRICols<F>& cols) {
+  cols.acc = event.ext_single.acc;
+  cols.alpha_pow = event.ext_vec.alpha_pow;
+  cols.p_at_z = event.ext_vec.p_at_z;
+  cols.p_at_x = event.base_vec.p_at_x;
+}
+// NOTE(review): is_end reduces to `len == 1`; all addrs use index 0. Confirm.
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const BatchFRIInstrFFI<F>& instr,
+                                  BatchFRIPreprocessedCols<F>& cols) {
+  cols.is_real = F(1);
+  cols.is_end =
+      F(instr.ext_vec_addrs_p_at_z_ptr ==
+        instr.ext_vec_addrs_p_at_z_ptr + instr.ext_vec_addrs_p_at_z_len - 1);
+  cols.acc_addr = instr.ext_single_addrs->acc;
+  cols.alpha_pow_addr = instr.ext_vec_addrs_alpha_pow_ptr[0];
+  cols.p_at_z_addr = instr.ext_vec_addrs_p_at_z_ptr[0];
+  cols.p_at_x_addr = instr.base_vec_addrs_p_at_x_ptr[0];
+}
+}  // namespace sp1_recursion_core_sys::batch_fri
diff --git a/crates/recursion/core/include/exp_reverse_bits.hpp b/crates/recursion/core/include/exp_reverse_bits.hpp
new file mode 100644
index 0000000000..840258315f
--- /dev/null
+++ b/crates/recursion/core/include/exp_reverse_bits.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::exp_reverse_bits {
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const ExpReverseBitsEventFFI<F>& event,
+                                  size_t i, ExpReverseBitsLenCols<F>& cols) {
+  cols.x = *event.base;
+  cols.current_bit = event.exp_ptr[i];
+  cols.multiplier = (event.exp_ptr[i] == F::one()) ? *event.base : F::one();
+}
+
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const ExpReverseBitsInstrFFI<F>& instr,
+                                  size_t i, size_t len,
+                                  ExpReverseBitsLenPreprocessedCols<F>& cols) {
+  // Flags for the first (`i == 0`) and last (`i == len - 1`) row.
+  const auto first = F::from_bool(i == 0);
+  const auto last = F::from_bool(i == len - 1);
+  cols.is_real = F::one();
+  cols.iteration_num = F::from_canonical_u32(i);
+  cols.is_first = first;
+  cols.is_last = last;
+  cols.x_mem.addr = *instr.base;
+  cols.x_mem.mult = F::zero() - first;
+  cols.exponent_mem.addr = instr.exp_ptr[i];
+  cols.exponent_mem.mult = F::zero() - F::one();
+  cols.result_mem.addr = *instr.result;
+  cols.result_mem.mult = *instr.mult * last;
+}
+}  // namespace sp1_recursion_core_sys::exp_reverse_bits
diff --git a/crates/recursion/core/include/fri_fold.hpp b/crates/recursion/core/include/fri_fold.hpp
new file mode 100644
index 0000000000..3ea30aa351
--- /dev/null
+++ b/crates/recursion/core/include/fri_fold.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::fri_fold {
+// Copies the values recorded in a FriFold event into the matching columns.
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const FriFoldEvent<F>& event,
+                                  FriFoldCols<F>& cols) {
+  const auto& vec = event.ext_vec;
+  cols.x = event.base_single.x;
+  cols.z = event.ext_single.z;
+  cols.alpha = event.ext_single.alpha;
+  cols.p_at_z = vec.ps_at_z;      // event field name differs from the column
+  cols.p_at_x = vec.mat_opening;  // event field name differs from the column
+  cols.alpha_pow_input = vec.alpha_pow_input;
+  cols.ro_input = vec.ro_input;
+  cols.alpha_pow_output = vec.alpha_pow_output;
+  cols.ro_output = vec.ro_output;
+}
+
+// Builds preprocessed row `i` of a FriFold instruction: the row flags plus one
+// (address, multiplicity) pair for each memory column.
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const FriFoldInstrFFI<F>& instr, size_t i,
+                                  FriFoldPreprocessedCols<F>& cols) {
+  const bool first = (i == 0);
+  // The two multiplicities shared by the input columns: minus the first-row
+  // flag, and 0 - 1.
+  const F neg_first = F::zero() - F::from_bool(first);
+  const F neg_one = F::zero() - F::one();
+  cols.is_real = F::one();
+  cols.is_first = F::from_bool(first);
+  // Single-address operands (z, x, alpha): multiplicity is `neg_first`.
+  cols.z_mem.addr = instr.ext_single_addrs->z;
+  cols.z_mem.mult = neg_first;
+  cols.x_mem.addr = instr.base_single_addrs->x;
+  cols.x_mem.mult = neg_first;
+  cols.alpha_mem.addr = instr.ext_single_addrs->alpha;
+  cols.alpha_mem.mult = neg_first;
+  // Vector inputs: i-th entry of each address array, multiplicity `neg_one`.
+  cols.alpha_pow_input_mem.addr = instr.ext_vec_addrs_alpha_pow_input_ptr[i];
+  cols.alpha_pow_input_mem.mult = neg_one;
+  cols.ro_input_mem.addr = instr.ext_vec_addrs_ro_input_ptr[i];
+  cols.ro_input_mem.mult = neg_one;
+  cols.p_at_z_mem.addr = instr.ext_vec_addrs_ps_at_z_ptr[i];
+  cols.p_at_z_mem.mult = neg_one;
+  cols.p_at_x_mem.addr = instr.ext_vec_addrs_mat_opening_ptr[i];
+  cols.p_at_x_mem.mult = neg_one;
+  // Vector outputs: multiplicity comes from the instruction's mult arrays.
+  cols.alpha_pow_output_mem.addr = instr.ext_vec_addrs_alpha_pow_output_ptr[i];
+  cols.alpha_pow_output_mem.mult = instr.alpha_pow_mults_ptr[i];
+  cols.ro_output_mem.addr = instr.ext_vec_addrs_ro_output_ptr[i];
+  cols.ro_output_mem.mult = instr.ro_mults_ptr[i];
+}
+}  // namespace sp1_recursion_core_sys::fri_fold
diff --git a/crates/recursion/core/include/poseidon2.hpp b/crates/recursion/core/include/poseidon2.hpp
new file mode 100644
index 0000000000..2234d916bf
--- /dev/null
+++ b/crates/recursion/core/include/poseidon2.hpp
@@ -0,0 +1,75 @@
+#pragma once
+
+#include "poseidon2_constants.hpp"
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::poseidon2 {
+using namespace constants;
+
+constexpr size_t INPUT_ROUND_IDX = 0;  // index of the input row
+constexpr size_t INTERNAL_ROUND_IDX = NUM_EXTERNAL_ROUNDS / 2 + 1;
+// Number of rows when rows are indexed 0..OUTPUT_ROUND_IDX inclusive.
+constexpr size_t NUM_ROUNDS = OUTPUT_ROUND_IDX + 1;
+// NOTE(review): these look like column counts without/with sbox - confirm.
+constexpr size_t PERMUTATION_NO_SBOX =
+    (WIDTH * NUM_EXTERNAL_ROUNDS) + WIDTH + (NUM_INTERNAL_ROUNDS - 1) + WIDTH;
+constexpr size_t PERMUTATION_SBOX =
+    PERMUTATION_NO_SBOX + (WIDTH * NUM_EXTERNAL_ROUNDS) + NUM_INTERNAL_ROUNDS;
+// State size used by the linear layers below; presumably equals WIDTH.
+constexpr size_t POSEIDON2_WIDTH = 16;
+
+// In-place 4x4 mixing step for one 4-element chunk (a, b, c, d) of the state.
+template <class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void mdsLightPermutation4x4(F state[4]) {
+  F a = state[0], b = state[1], c = state[2], d = state[3];
+  F ab = a + b, cd = c + d;
+  F all = ab + cd;
+  F all_b = all + b, all_d = all + d;
+  state[0] = all_b + ab;                // 2a + 3b + c + d
+  state[1] = all_b + operator<<(c, 1);  // a + 2b + c + d + (c << 1)
+  state[2] = all_d + cd;                // a + b + 2c + 3d
+  state[3] = all_d + operator<<(a, 1);  // a + b + c + 2d + (a << 1)
+}
+
+// Applies the external linear layer in place: the 4x4 mix on each chunk of
+// four, then adds to each element the sum of the elements at its chunk offset.
+template <class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void external_linear_layer(F state_var[POSEIDON2_WIDTH]) {
+  // size_t index: same type as POSEIDON2_WIDTH, so no signed/unsigned compare.
+  for (size_t i = 0; i < POSEIDON2_WIDTH; i += 4) {
+    mdsLightPermutation4x4(state_var + i);
+  }
+  // sums[k] totals the mixed elements at indices k, k + 4, k + 8, k + 12.
+  F sums[4] = {F::zero(), F::zero(), F::zero(), F::zero()};
+  for (size_t j = 0; j < POSEIDON2_WIDTH; j++) {
+    sums[j % 4] = sums[j % 4] + state_var[j];
+  }
+  for (size_t j = 0; j < POSEIDON2_WIDTH; j++) {
+    state_var[j] = state_var[j] + sums[j % 4];
+  }
+}
+
+// Applies the internal linear layer in place: each element is multiplied by
+// its diagonal constant, the sum of the incoming state is added, and the
+// result is multiplied by `monty_inverse`.
+template <class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void internal_linear_layer(F state[POSEIDON2_WIDTH]) {
+  // Sum of the state before any element is rewritten.
+  F total = F::zero();
+  for (size_t k = 0; k < POSEIDON2_WIDTH; k++) {
+    total = total + state[k];
+  }
+
+  // Final scaling factor, built from the literal 1 via from_monty/to_monty.
+  F monty_inverse = F(F::to_monty(F::from_monty(1)));
+
+  for (size_t k = 0; k < POSEIDON2_WIDTH; k++) {
+    // The stored diagonal entry is round-tripped through from_monty/to_monty.
+    F diag = F(F::to_monty(F::from_monty(
+        constants::POSEIDON2_INTERNAL_MATRIX_DIAG_16_BABYBEAR_MONTY[k].val)));
+    F scaled = state[k] * diag;
+    scaled = scaled + total;
+    state[k] = scaled * monty_inverse;
+  }
+}
+}  // namespace sp1_recursion_core_sys::poseidon2
\ No newline at end of file
diff --git a/crates/recursion/core/include/poseidon2_constants.hpp b/crates/recursion/core/include/poseidon2_constants.hpp
new file mode 100644
index 0000000000..c076d6b64a
--- /dev/null
+++ b/crates/recursion/core/include/poseidon2_constants.hpp
@@ -0,0 +1,1100 @@
+#pragma once
+
+namespace sp1_recursion_core_sys::constants {
+#ifdef __CUDA_ARCH__
+__constant__ constexpr const uint32_t RC_16_30_U32[30][16] = {
+    {
+        2110014213U,
+        3964964605U,
+        2190662774U,
+        2732996483U,
+        640767983U,
+        3403899136U,
+        1716033721U,
+        1606702601U,
+        3759873288U,
+        1466015491U,
+        1498308946U,
+        2844375094U,
+        3042463841U,
+        1969905919U,
+        4109944726U,
+        3925048366U,
+    },
+    {
+        3706859504U,
+        759122502U,
+        3167665446U,
+        1131812921U,
+        1080754908U,
+        4080114493U,
+        893583089U,
+        2019677373U,
+        3128604556U,
+        580640471U,
+        3277620260U,
+        842931656U,
+        548879852U,
+        3608554714U,
+        3575647916U,
+        81826002U,
+    },
+    {
+        4289086263U,
+        1563933798U,
+        1440025885U,
+        184445025U,
+        2598651360U,
+        1396647410U,
+        1575877922U,
+        3303853401U,
+        137125468U,
+        765010148U,
+        633675867U,
+        2037803363U,
+        2573389828U,
+        1895729703U,
+        541515871U,
+        1783382863U,
+    },
+    {
+        2641856484U,
+        3035743342U,
+        3672796326U,
+        245668751U,
+        2025460432U,
+        201609705U,
+        286217151U,
+        4093475563U,
+        2519572182U,
+        3080699870U,
+        2762001832U,
+        1244250808U,
+        606038199U,
+        3182740831U,
+        73007766U,
+        2572204153U,
+    },
+    {
+        1196780786U,
+        3447394443U,
+        747167305U,
+        2968073607U,
+        1053214930U,
+        1074411832U,
+        4016794508U,
+        1570312929U,
+        113576933U,
+        4042581186U,
+        3634515733U,
+        1032701597U,
+        2364839308U,
+        3840286918U,
+        888378655U,
+        2520191583U,
+    },
+    {
+        36046858U,
+        2927525953U,
+        3912129105U,
+        4004832531U,
+        193772436U,
+        1590247392U,
+        4125818172U,
+        2516251696U,
+        4050945750U,
+        269498914U,
+        1973292656U,
+        891403491U,
+        1845429189U,
+        2611996363U,
+        2310542653U,
+        4071195740U,
+    },
+    {
+        3505307391U,
+        786445290U,
+        3815313971U,
+        1111591756U,
+        4233279834U,
+        2775453034U,
+        1991257625U,
+        2940505809U,
+        2751316206U,
+        1028870679U,
+        1282466273U,
+        1059053371U,
+        834521354U,
+        138721483U,
+        3100410803U,
+        3843128331U,
+    },
+    {
+        3878220780U,
+        4058162439U,
+        1478942487U,
+        799012923U,
+        496734827U,
+        3521261236U,
+        755421082U,
+        1361409515U,
+        392099473U,
+        3178453393U,
+        4068463721U,
+        7935614U,
+        4140885645U,
+        2150748066U,
+        1685210312U,
+        3852983224U,
+    },
+    {
+        2896943075U,
+        3087590927U,
+        992175959U,
+        970216228U,
+        3473630090U,
+        3899670400U,
+        3603388822U,
+        2633488197U,
+        2479406964U,
+        2420952999U,
+        1852516800U,
+        4253075697U,
+        979699862U,
+        1163403191U,
+        1608599874U,
+        3056104448U,
+    },
+    {
+        3779109343U,
+        536205958U,
+        4183458361U,
+        1649720295U,
+        1444912244U,
+        3122230878U,
+        384301396U,
+        4228198516U,
+        1662916865U,
+        4082161114U,
+        2121897314U,
+        1706239958U,
+        4166959388U,
+        1626054781U,
+        3005858978U,
+        1431907253U,
+    },
+    {
+        1418914503U,
+        1365856753U,
+        3942715745U,
+        1429155552U,
+        3545642795U,
+        3772474257U,
+        1621094396U,
+        2154399145U,
+        826697382U,
+        1700781391U,
+        3539164324U,
+        652815039U,
+        442484755U,
+        2055299391U,
+        1064289978U,
+        1152335780U,
+    },
+    {
+        3417648695U,
+        186040114U,
+        3475580573U,
+        2113941250U,
+        1779573826U,
+        1573808590U,
+        3235694804U,
+        2922195281U,
+        1119462702U,
+        3688305521U,
+        1849567013U,
+        667446787U,
+        753897224U,
+        1896396780U,
+        3143026334U,
+        3829603876U,
+    },
+    {
+        859661334U,
+        3898844357U,
+        180258337U,
+        2321867017U,
+        3599002504U,
+        2886782421U,
+        3038299378U,
+        1035366250U,
+        2038912197U,
+        2920174523U,
+        1277696101U,
+        2785700290U,
+        3806504335U,
+        3518858933U,
+        654843672U,
+        2127120275U,
+    },
+    {
+        1548195514U,
+        2378056027U,
+        390914568U,
+        1472049779U,
+        1552596765U,
+        1905886441U,
+        1611959354U,
+        3653263304U,
+        3423946386U,
+        340857935U,
+        2208879480U,
+        139364268U,
+        3447281773U,
+        3777813707U,
+        55640413U,
+        4101901741U,
+    },
+    {104929687U, 1459980974U, 1831234737U, 457139004U, 2581487628U, 2112044563U,
+     3567013861U, 2792004347U, 576325418U, 41126132U, 2713562324U, 151213722U,
+     2891185935U, 546846420U, 2939794919U, 2543469905U},
+    {
+        2191909784U,
+        3315138460U,
+        530414574U,
+        1242280418U,
+        1211740715U,
+        3993672165U,
+        2505083323U,
+        3845798801U,
+        538768466U,
+        2063567560U,
+        3366148274U,
+        1449831887U,
+        2408012466U,
+        294726285U,
+        3943435493U,
+        924016661U,
+    },
+    {
+        3633138367U,
+        3222789372U,
+        809116305U,
+        30100013U,
+        2655172876U,
+        2564247117U,
+        2478649732U,
+        4113689151U,
+        4120146082U,
+        2512308515U,
+        650406041U,
+        4240012393U,
+        2683508708U,
+        951073977U,
+        3460081988U,
+        339124269U,
+    },
+    {
+        130182653U,
+        2755946749U,
+        542600513U,
+        2816103022U,
+        1931786340U,
+        2044470840U,
+        1709908013U,
+        2938369043U,
+        3640399693U,
+        1374470239U,
+        2191149676U,
+        2637495682U,
+        4236394040U,
+        2289358846U,
+        3833368530U,
+        974546524U,
+    },
+    {
+        3306659113U,
+        2234814261U,
+        1188782305U,
+        223782844U,
+        2248980567U,
+        2309786141U,
+        2023401627U,
+        3278877413U,
+        2022138149U,
+        575851471U,
+        1612560780U,
+        3926656936U,
+        3318548977U,
+        2591863678U,
+        188109355U,
+        4217723909U,
+    },
+    {
+        1564209905U,
+        2154197895U,
+        2459687029U,
+        2870634489U,
+        1375012945U,
+        1529454825U,
+        306140690U,
+        2855578299U,
+        1246997295U,
+        3024298763U,
+        1915270363U,
+        1218245412U,
+        2479314020U,
+        2989827755U,
+        814378556U,
+        4039775921U,
+    },
+    {
+        1165280628U,
+        1203983801U,
+        3814740033U,
+        1919627044U,
+        600240215U,
+        773269071U,
+        486685186U,
+        4254048810U,
+        1415023565U,
+        502840102U,
+        4225648358U,
+        510217063U,
+        166444818U,
+        1430745893U,
+        1376516190U,
+        1775891321U,
+    },
+    {
+        1170945922U,
+        1105391877U,
+        261536467U,
+        1401687994U,
+        1022529847U,
+        2476446456U,
+        2603844878U,
+        3706336043U,
+        3463053714U,
+        1509644517U,
+        588552318U,
+        65252581U,
+        3696502656U,
+        2183330763U,
+        3664021233U,
+        1643809916U,
+    },
+    {
+        2922875898U,
+        3740690643U,
+        3932461140U,
+        161156271U,
+        2619943483U,
+        4077039509U,
+        2921201703U,
+        2085619718U,
+        2065264646U,
+        2615693812U,
+        3116555433U,
+        246100007U,
+        4281387154U,
+        4046141001U,
+        4027749321U,
+        111611860U,
+    },
+    {
+        2066954820U,
+        2502099969U,
+        2915053115U,
+        2362518586U,
+        366091708U,
+        2083204932U,
+        4138385632U,
+        3195157567U,
+        1318086382U,
+        521723799U,
+        702443405U,
+        2507670985U,
+        1760347557U,
+        2631999893U,
+        1672737554U,
+        1060867760U,
+    },
+    {
+        2359801781U,
+        2800231467U,
+        3010357035U,
+        1035997899U,
+        1210110952U,
+        1018506770U,
+        2799468177U,
+        1479380761U,
+        1536021911U,
+        358993854U,
+        579904113U,
+        3432144800U,
+        3625515809U,
+        199241497U,
+        4058304109U,
+        2590164234U,
+    },
+    {
+        1688530738U,
+        1580733335U,
+        2443981517U,
+        2206270565U,
+        2780074229U,
+        2628739677U,
+        2940123659U,
+        4145206827U,
+        3572278009U,
+        2779607509U,
+        1098718697U,
+        1424913749U,
+        2224415875U,
+        1108922178U,
+        3646272562U,
+        3935186184U,
+    },
+    {
+        820046587U,
+        1393386250U,
+        2665818575U,
+        2231782019U,
+        672377010U,
+        1920315467U,
+        1913164407U,
+        2029526876U,
+        2629271820U,
+        384320012U,
+        4112320585U,
+        3131824773U,
+        2347818197U,
+        2220997386U,
+        1772368609U,
+        2579960095U,
+    },
+    {
+        3544930873U,
+        225847443U,
+        3070082278U,
+        95643305U,
+        3438572042U,
+        3312856509U,
+        615850007U,
+        1863868773U,
+        803582265U,
+        3461976859U,
+        2903025799U,
+        1482092434U,
+        3902972499U,
+        3872341868U,
+        1530411808U,
+        2214923584U,
+    },
+    {
+        3118792481U,
+        2241076515U,
+        3983669831U,
+        3180915147U,
+        3838626501U,
+        1921630011U,
+        3415351771U,
+        2249953859U,
+        3755081630U,
+        486327260U,
+        1227575720U,
+        3643869379U,
+        2982026073U,
+        2466043731U,
+        1982634375U,
+        3769609014U,
+    },
+    {
+        2195455495U,
+        2596863283U,
+        4244994973U,
+        1983609348U,
+        4019674395U,
+        3469982031U,
+        1458697570U,
+        1593516217U,
+        1963896497U,
+        3115309118U,
+        1659132465U,
+        2536770756U,
+        3059294171U,
+        2618031334U,
+        2040903247U,
+        3799795076U,
+    }};
+#else
+constexpr const uint32_t RC_16_30_U32[30][16] = {
+    {
+        2110014213U,
+        3964964605U,
+        2190662774U,
+        2732996483U,
+        640767983U,
+        3403899136U,
+        1716033721U,
+        1606702601U,
+        3759873288U,
+        1466015491U,
+        1498308946U,
+        2844375094U,
+        3042463841U,
+        1969905919U,
+        4109944726U,
+        3925048366U,
+    },
+    {
+        3706859504U,
+        759122502U,
+        3167665446U,
+        1131812921U,
+        1080754908U,
+        4080114493U,
+        893583089U,
+        2019677373U,
+        3128604556U,
+        580640471U,
+        3277620260U,
+        842931656U,
+        548879852U,
+        3608554714U,
+        3575647916U,
+        81826002U,
+    },
+    {
+        4289086263U,
+        1563933798U,
+        1440025885U,
+        184445025U,
+        2598651360U,
+        1396647410U,
+        1575877922U,
+        3303853401U,
+        137125468U,
+        765010148U,
+        633675867U,
+        2037803363U,
+        2573389828U,
+        1895729703U,
+        541515871U,
+        1783382863U,
+    },
+    {
+        2641856484U,
+        3035743342U,
+        3672796326U,
+        245668751U,
+        2025460432U,
+        201609705U,
+        286217151U,
+        4093475563U,
+        2519572182U,
+        3080699870U,
+        2762001832U,
+        1244250808U,
+        606038199U,
+        3182740831U,
+        73007766U,
+        2572204153U,
+    },
+    {
+        1196780786U,
+        3447394443U,
+        747167305U,
+        2968073607U,
+        1053214930U,
+        1074411832U,
+        4016794508U,
+        1570312929U,
+        113576933U,
+        4042581186U,
+        3634515733U,
+        1032701597U,
+        2364839308U,
+        3840286918U,
+        888378655U,
+        2520191583U,
+    },
+    {
+        36046858U,
+        2927525953U,
+        3912129105U,
+        4004832531U,
+        193772436U,
+        1590247392U,
+        4125818172U,
+        2516251696U,
+        4050945750U,
+        269498914U,
+        1973292656U,
+        891403491U,
+        1845429189U,
+        2611996363U,
+        2310542653U,
+        4071195740U,
+    },
+    {
+        3505307391U,
+        786445290U,
+        3815313971U,
+        1111591756U,
+        4233279834U,
+        2775453034U,
+        1991257625U,
+        2940505809U,
+        2751316206U,
+        1028870679U,
+        1282466273U,
+        1059053371U,
+        834521354U,
+        138721483U,
+        3100410803U,
+        3843128331U,
+    },
+    {
+        3878220780U,
+        4058162439U,
+        1478942487U,
+        799012923U,
+        496734827U,
+        3521261236U,
+        755421082U,
+        1361409515U,
+        392099473U,
+        3178453393U,
+        4068463721U,
+        7935614U,
+        4140885645U,
+        2150748066U,
+        1685210312U,
+        3852983224U,
+    },
+    {
+        2896943075U,
+        3087590927U,
+        992175959U,
+        970216228U,
+        3473630090U,
+        3899670400U,
+        3603388822U,
+        2633488197U,
+        2479406964U,
+        2420952999U,
+        1852516800U,
+        4253075697U,
+        979699862U,
+        1163403191U,
+        1608599874U,
+        3056104448U,
+    },
+    {
+        3779109343U,
+        536205958U,
+        4183458361U,
+        1649720295U,
+        1444912244U,
+        3122230878U,
+        384301396U,
+        4228198516U,
+        1662916865U,
+        4082161114U,
+        2121897314U,
+        1706239958U,
+        4166959388U,
+        1626054781U,
+        3005858978U,
+        1431907253U,
+    },
+    {
+        1418914503U,
+        1365856753U,
+        3942715745U,
+        1429155552U,
+        3545642795U,
+        3772474257U,
+        1621094396U,
+        2154399145U,
+        826697382U,
+        1700781391U,
+        3539164324U,
+        652815039U,
+        442484755U,
+        2055299391U,
+        1064289978U,
+        1152335780U,
+    },
+    {
+        3417648695U,
+        186040114U,
+        3475580573U,
+        2113941250U,
+        1779573826U,
+        1573808590U,
+        3235694804U,
+        2922195281U,
+        1119462702U,
+        3688305521U,
+        1849567013U,
+        667446787U,
+        753897224U,
+        1896396780U,
+        3143026334U,
+        3829603876U,
+    },
+    {
+        859661334U,
+        3898844357U,
+        180258337U,
+        2321867017U,
+        3599002504U,
+        2886782421U,
+        3038299378U,
+        1035366250U,
+        2038912197U,
+        2920174523U,
+        1277696101U,
+        2785700290U,
+        3806504335U,
+        3518858933U,
+        654843672U,
+        2127120275U,
+    },
+    {
+        1548195514U,
+        2378056027U,
+        390914568U,
+        1472049779U,
+        1552596765U,
+        1905886441U,
+        1611959354U,
+        3653263304U,
+        3423946386U,
+        340857935U,
+        2208879480U,
+        139364268U,
+        3447281773U,
+        3777813707U,
+        55640413U,
+        4101901741U,
+    },
+    {104929687U, 1459980974U, 1831234737U, 457139004U, 2581487628U, 2112044563U,
+     3567013861U, 2792004347U, 576325418U, 41126132U, 2713562324U, 151213722U,
+     2891185935U, 546846420U, 2939794919U, 2543469905U},
+    {
+        2191909784U,
+        3315138460U,
+        530414574U,
+        1242280418U,
+        1211740715U,
+        3993672165U,
+        2505083323U,
+        3845798801U,
+        538768466U,
+        2063567560U,
+        3366148274U,
+        1449831887U,
+        2408012466U,
+        294726285U,
+        3943435493U,
+        924016661U,
+    },
+    {
+        3633138367U,
+        3222789372U,
+        809116305U,
+        30100013U,
+        2655172876U,
+        2564247117U,
+        2478649732U,
+        4113689151U,
+        4120146082U,
+        2512308515U,
+        650406041U,
+        4240012393U,
+        2683508708U,
+        951073977U,
+        3460081988U,
+        339124269U,
+    },
+    {
+        130182653U,
+        2755946749U,
+        542600513U,
+        2816103022U,
+        1931786340U,
+        2044470840U,
+        1709908013U,
+        2938369043U,
+        3640399693U,
+        1374470239U,
+        2191149676U,
+        2637495682U,
+        4236394040U,
+        2289358846U,
+        3833368530U,
+        974546524U,
+    },
+    {
+        3306659113U,
+        2234814261U,
+        1188782305U,
+        223782844U,
+        2248980567U,
+        2309786141U,
+        2023401627U,
+        3278877413U,
+        2022138149U,
+        575851471U,
+        1612560780U,
+        3926656936U,
+        3318548977U,
+        2591863678U,
+        188109355U,
+        4217723909U,
+    },
+    {
+        1564209905U,
+        2154197895U,
+        2459687029U,
+        2870634489U,
+        1375012945U,
+        1529454825U,
+        306140690U,
+        2855578299U,
+        1246997295U,
+        3024298763U,
+        1915270363U,
+        1218245412U,
+        2479314020U,
+        2989827755U,
+        814378556U,
+        4039775921U,
+    },
+    {
+        1165280628U,
+        1203983801U,
+        3814740033U,
+        1919627044U,
+        600240215U,
+        773269071U,
+        486685186U,
+        4254048810U,
+        1415023565U,
+        502840102U,
+        4225648358U,
+        510217063U,
+        166444818U,
+        1430745893U,
+        1376516190U,
+        1775891321U,
+    },
+    {
+        1170945922U,
+        1105391877U,
+        261536467U,
+        1401687994U,
+        1022529847U,
+        2476446456U,
+        2603844878U,
+        3706336043U,
+        3463053714U,
+        1509644517U,
+        588552318U,
+        65252581U,
+        3696502656U,
+        2183330763U,
+        3664021233U,
+        1643809916U,
+    },
+    {
+        2922875898U,
+        3740690643U,
+        3932461140U,
+        161156271U,
+        2619943483U,
+        4077039509U,
+        2921201703U,
+        2085619718U,
+        2065264646U,
+        2615693812U,
+        3116555433U,
+        246100007U,
+        4281387154U,
+        4046141001U,
+        4027749321U,
+        111611860U,
+    },
+    {
+        2066954820U,
+        2502099969U,
+        2915053115U,
+        2362518586U,
+        366091708U,
+        2083204932U,
+        4138385632U,
+        3195157567U,
+        1318086382U,
+        521723799U,
+        702443405U,
+        2507670985U,
+        1760347557U,
+        2631999893U,
+        1672737554U,
+        1060867760U,
+    },
+    {
+        2359801781U,
+        2800231467U,
+        3010357035U,
+        1035997899U,
+        1210110952U,
+        1018506770U,
+        2799468177U,
+        1479380761U,
+        1536021911U,
+        358993854U,
+        579904113U,
+        3432144800U,
+        3625515809U,
+        199241497U,
+        4058304109U,
+        2590164234U,
+    },
+    {
+        1688530738U,
+        1580733335U,
+        2443981517U,
+        2206270565U,
+        2780074229U,
+        2628739677U,
+        2940123659U,
+        4145206827U,
+        3572278009U,
+        2779607509U,
+        1098718697U,
+        1424913749U,
+        2224415875U,
+        1108922178U,
+        3646272562U,
+        3935186184U,
+    },
+    {
+        820046587U,
+        1393386250U,
+        2665818575U,
+        2231782019U,
+        672377010U,
+        1920315467U,
+        1913164407U,
+        2029526876U,
+        2629271820U,
+        384320012U,
+        4112320585U,
+        3131824773U,
+        2347818197U,
+        2220997386U,
+        1772368609U,
+        2579960095U,
+    },
+    {
+        3544930873U,
+        225847443U,
+        3070082278U,
+        95643305U,
+        3438572042U,
+        3312856509U,
+        615850007U,
+        1863868773U,
+        803582265U,
+        3461976859U,
+        2903025799U,
+        1482092434U,
+        3902972499U,
+        3872341868U,
+        1530411808U,
+        2214923584U,
+    },
+    {
+        3118792481U,
+        2241076515U,
+        3983669831U,
+        3180915147U,
+        3838626501U,
+        1921630011U,
+        3415351771U,
+        2249953859U,
+        3755081630U,
+        486327260U,
+        1227575720U,
+        3643869379U,
+        2982026073U,
+        2466043731U,
+        1982634375U,
+        3769609014U,
+    },
+    {
+        2195455495U,
+        2596863283U,
+        4244994973U,
+        1983609348U,
+        4019674395U,
+        3469982031U,
+        1458697570U,
+        1593516217U,
+        1963896497U,
+        3115309118U,
+        1659132465U,
+        2536770756U,
+        3059294171U,
+        2618031334U,
+        2040903247U,
+        3799795076U,
+    }};
+#endif
+
+#ifdef __CUDA_ARCH__  // device pass: array is declared __constant__
+__constant__ constexpr const bb31_t
+    POSEIDON2_INTERNAL_MATRIX_DIAG_16_BABYBEAR_MONTY[16] = {
+        bb31_t(bb31_t::to_monty(0x78000001u - 2)),  // BabyBear::ORDER_U32 - 2
+        bb31_t(bb31_t::to_monty(1)),                // 1
+        bb31_t(bb31_t::to_monty(1 << 1)),           // 1 << 1
+        bb31_t(bb31_t::to_monty(1 << 2)),           // 1 << 2
+        bb31_t(bb31_t::to_monty(1 << 3)),           // 1 << 3
+        bb31_t(bb31_t::to_monty(1 << 4)),           // 1 << 4
+        bb31_t(bb31_t::to_monty(1 << 5)),           // 1 << 5
+        bb31_t(bb31_t::to_monty(1 << 6)),           // 1 << 6
+        bb31_t(bb31_t::to_monty(1 << 7)),           // 1 << 7
+        bb31_t(bb31_t::to_monty(1 << 8)),           // 1 << 8
+        bb31_t(bb31_t::to_monty(1 << 9)),           // 1 << 9
+        bb31_t(bb31_t::to_monty(1 << 10)),          // 1 << 10
+        bb31_t(bb31_t::to_monty(1 << 11)),          // 1 << 11
+        bb31_t(bb31_t::to_monty(1 << 12)),          // 1 << 12
+        bb31_t(bb31_t::to_monty(1 << 13)),          // 1 << 13
+        bb31_t(bb31_t::to_monty(1 << 15)),          // 1 << 15 (skips 1 << 14)
+};
+#else  // otherwise: same entries without the __constant__ qualifier
+constexpr const bb31_t POSEIDON2_INTERNAL_MATRIX_DIAG_16_BABYBEAR_MONTY[16] = {
+    bb31_t(bb31_t::to_monty(0x78000001u - 2)),  // BabyBear::ORDER_U32 - 2
+    bb31_t(bb31_t::to_monty(1)),                // 1
+    bb31_t(bb31_t::to_monty(1 << 1)),           // 1 << 1
+    bb31_t(bb31_t::to_monty(1 << 2)),           // 1 << 2
+    bb31_t(bb31_t::to_monty(1 << 3)),           // 1 << 3
+    bb31_t(bb31_t::to_monty(1 << 4)),           // 1 << 4
+    bb31_t(bb31_t::to_monty(1 << 5)),           // 1 << 5
+    bb31_t(bb31_t::to_monty(1 << 6)),           // 1 << 6
+    bb31_t(bb31_t::to_monty(1 << 7)),           // 1 << 7
+    bb31_t(bb31_t::to_monty(1 << 8)),           // 1 << 8
+    bb31_t(bb31_t::to_monty(1 << 9)),           // 1 << 9
+    bb31_t(bb31_t::to_monty(1 << 10)),          // 1 << 10
+    bb31_t(bb31_t::to_monty(1 << 11)),          // 1 << 11
+    bb31_t(bb31_t::to_monty(1 << 12)),          // 1 << 12
+    bb31_t(bb31_t::to_monty(1 << 13)),          // 1 << 13
+    bb31_t(bb31_t::to_monty(1 << 15)),          // 1 << 15 (skips 1 << 14)
+};
+#endif  // NOTE(review): confirm the 13 -> 15 jump against the reference table
+}  // namespace sp1_recursion_core_sys::constants
\ No newline at end of file
diff --git a/crates/recursion/core/include/poseidon2_skinny.hpp b/crates/recursion/core/include/poseidon2_skinny.hpp
new file mode 100644
index 0000000000..cc911a6a18
--- /dev/null
+++ b/crates/recursion/core/include/poseidon2_skinny.hpp
@@ -0,0 +1,116 @@
+#pragma once
+
+#include "poseidon2.hpp"
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::poseidon2_skinny {
+using namespace constants;
+using namespace poseidon2;
+
+// One external round: add the round constants, raise each element to the
+// 7th power, then apply the external linear layer to `next_state_var`.
+template <class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void populate_external_round(
+    F round_state[WIDTH], size_t r, F next_state_var[WIDTH]) {
+  // For r >= NUM_EXTERNAL_ROUNDS / 2 the constant row is shifted further.
+  const size_t second_half = (r >= NUM_EXTERNAL_ROUNDS / 2) ? 1 : 0;
+  const size_t rc_row = r + second_half * (NUM_INTERNAL_ROUNDS - 1);
+  for (size_t k = 0; k < WIDTH; k++) {
+    F x = round_state[k] + F(F::to_monty(RC_16_30_U32[rc_row][k]));
+    F x3 = x * x * x;
+    next_state_var[k] = x3 * x3 * x;  // x^7
+  }
+  external_linear_layer<F>(next_state_var);
+}
+
+// Runs all internal rounds starting from `state`, leaving the result in
+// `next_state_var` and recording element 0 after every round but the last.
+template <class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void populate_internal_rounds(
+    F state[WIDTH], F internal_rounds_s0[NUM_INTERNAL_ROUNDS_S0],
+    F next_state_var[WIDTH]) {
+  // Start from a copy of `state`; all later updates go to `next_state_var`.
+  for (size_t k = 0; k < WIDTH; k++) {
+    next_state_var[k] = state[k];
+  }
+  const size_t rc_base = NUM_EXTERNAL_ROUNDS / 2;
+  for (size_t r = 0; r < NUM_INTERNAL_ROUNDS; r++) {
+    // Only element 0 gets the round constant and the x^7 power map.
+    F x = next_state_var[0] + F(F::to_monty(RC_16_30_U32[rc_base + r][0]));
+    F x3 = x * x * x;
+    next_state_var[0] = x3 * x3 * x;
+    internal_linear_layer<F>(next_state_var);
+
+    if (r + 1 < NUM_INTERNAL_ROUNDS) {
+      internal_rounds_s0[r] = next_state_var[0];
+    }
+  }
+}
+
+// Fills the per-round rows of one Poseidon2 event: row 0 holds the raw input,
+// row 1 the input after the external linear layer, and every later row is
+// computed from the row before it.
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const Poseidon2Event<F>& event,
+                                  Poseidon2<F> cols[OUTPUT_ROUND_IDX + 1]) {
+  F* input_row = cols[0].state_var;
+  F* mixed_row = cols[1].state_var;
+  for (size_t k = 0; k < 16; k++) {
+    input_row[k] = event.input[k];
+    mixed_row[k] = event.input[k];
+  }
+  external_linear_layer<F>(mixed_row);
+
+  for (size_t row = 1; row < OUTPUT_ROUND_IDX; row++) {
+    Poseidon2<F>& cur = cols[row];
+    F* next_state = cols[row + 1].state_var;
+    if (row == INTERNAL_ROUND_IDX) {
+      // The internal row also fills its own internal_rounds_s0 columns.
+      populate_internal_rounds<F>(cur.state_var, cur.internal_rounds_s0,
+                                  next_state);
+    } else {
+      // External rounds receive row - 1 as their round argument.
+      populate_external_round<F>(cur.state_var, row - 1, next_state);
+    }
+  }
+}
+
+// Builds preprocessed row `i` for one Poseidon2 instruction: round-kind flags,
+// the row's round constants, and memory entries on the input/output rows.
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const Poseidon2Instr<F>& instr, size_t i,
+                                  Poseidon2PreprocessedColsSkinny<F>& cols) {
+  const bool is_input = (i == INPUT_ROUND_IDX);
+  const bool is_internal = (i == INTERNAL_ROUND_IDX);
+  const bool is_output = (i == OUTPUT_ROUND_IDX);
+  const bool is_external = !(is_input || is_internal || is_output);
+  auto& counters = cols.round_counters_preprocessed;
+  counters.is_input_round = F::from_bool(is_input);
+  counters.is_external_round = F::from_bool(is_external);
+  counters.is_internal_round = F::from_bool(is_internal);
+  // External rows at or past the internal row use a shifted constant row.
+  const size_t ext_offset =
+      (i < INTERNAL_ROUND_IDX) ? 0 : NUM_INTERNAL_ROUNDS - 1;
+  for (size_t j = 0; j < WIDTH; j++) {
+    F rc = F::zero();
+    if (is_external) {
+      rc = F(F::to_monty(RC_16_30_U32[i - 1 + ext_offset][j]));
+    } else if (is_internal) {
+      rc = F(F::to_monty(RC_16_30_U32[NUM_EXTERNAL_ROUNDS / 2 + j][0]));
+    }
+    counters.round_constants[j] = rc;
+  }
+
+  if (is_input) {
+    for (size_t j = 0; j < WIDTH; j++) {
+      cols.memory_preprocessed[j].addr = instr.addrs.input[j];
+      cols.memory_preprocessed[j].mult = F::zero() - F::one();
+    }
+  } else if (is_output) {
+    for (size_t j = 0; j < WIDTH; j++) {
+      cols.memory_preprocessed[j].addr = instr.addrs.output[j];
+      cols.memory_preprocessed[j].mult = instr.mults[j];
+    }
+  }
+}
+}  // namespace sp1_recursion_core_sys::poseidon2_skinny
\ No newline at end of file
diff --git a/crates/recursion/core/include/poseidon2_wide.hpp b/crates/recursion/core/include/poseidon2_wide.hpp
new file mode 100644
index 0000000000..8badbfe38b
--- /dev/null
+++ b/crates/recursion/core/include/poseidon2_wide.hpp
@@ -0,0 +1,198 @@
+#pragma once
+
+#include "poseidon2.hpp"
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::poseidon2_wide {
+using namespace constants;
+using namespace poseidon2;
+// Runs external round r, filling sbox[r * WIDTH + i] and next_state[i].
+template <class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void populate_external_round(
+    const F external_rounds_state[WIDTH * NUM_EXTERNAL_ROUNDS],
+    F sbox[WIDTH * NUM_EXTERNAL_ROUNDS], size_t r, F next_state[WIDTH]) {
+  F round_state[WIDTH];
+  if (r == 0) {
+    // Round 0 first applies the external linear layer to a copy of its input.
+    F temp_round_state[WIDTH];
+    for (size_t i = 0; i < WIDTH; i++) {
+      temp_round_state[i] = external_rounds_state[r * WIDTH + i];
+    }
+    external_linear_layer<F>(temp_round_state);
+    for (size_t i = 0; i < WIDTH; i++) {
+      round_state[i] = temp_round_state[i];
+    }
+  } else {
+    for (size_t i = 0; i < WIDTH; i++) {
+      round_state[i] = external_rounds_state[r * WIDTH + i];
+    }
+  }
+  // Second-half rounds index RC past the NUM_INTERNAL_ROUNDS internal entries.
+  size_t round = r < NUM_EXTERNAL_ROUNDS / 2 ? r : r + NUM_INTERNAL_ROUNDS;
+  F add_rc[WIDTH];
+  for (size_t i = 0; i < WIDTH; i++) {
+    add_rc[i] = round_state[i] + F(F::to_monty(RC_16_30_U32[round][i]));
+  }
+  // x^3 is stored in `sbox`; x^7 = (x^3)^2 * x feeds the linear layer below.
+  F sbox_deg_3[WIDTH];
+  F sbox_deg_7[WIDTH];
+  for (size_t i = 0; i < WIDTH; i++) {
+    sbox_deg_3[i] = add_rc[i] * add_rc[i] * add_rc[i];
+    sbox_deg_7[i] = sbox_deg_3[i] * sbox_deg_3[i] * add_rc[i];
+  }
+
+  for (size_t i = 0; i < WIDTH; i++) {
+    sbox[r * WIDTH + i] = sbox_deg_3[i];
+  }
+
+  for (size_t i = 0; i < WIDTH; i++) {
+    next_state[i] = sbox_deg_7[i];
+  }
+  external_linear_layer<F>(next_state);
+}
+// Runs every internal round from internal_rounds_state into ret_state.
+template <class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void populate_internal_rounds(
+    const F internal_rounds_state[WIDTH],
+    F internal_rounds_s0[NUM_INTERNAL_ROUNDS - 1], F sbox[NUM_INTERNAL_ROUNDS],
+    F ret_state[WIDTH]) {
+  F state[WIDTH];
+  for (size_t i = 0; i < WIDTH; i++) {
+    state[i] = internal_rounds_state[i];
+  }
+  // Only lane 0 receives the round constant and the sbox in each round.
+  F sbox_deg_3[NUM_INTERNAL_ROUNDS];
+  for (size_t r = 0; r < NUM_INTERNAL_ROUNDS; r++) {
+    size_t round = r + NUM_EXTERNAL_ROUNDS / 2;
+    F add_rc = state[0] + F(F::to_monty(RC_16_30_U32[round][0]));
+
+    sbox_deg_3[r] = add_rc * add_rc * add_rc;
+    F sbox_deg_7 = sbox_deg_3[r] * sbox_deg_3[r] * add_rc;
+
+    state[0] = sbox_deg_7;
+    internal_linear_layer<F>(state);
+    // Lane 0 after the final round is not recorded in internal_rounds_s0.
+    if (r < NUM_INTERNAL_ROUNDS - 1) {
+      internal_rounds_s0[r] = state[0];
+    }
+  }
+
+  for (size_t i = 0; i < WIDTH; i++) {
+    ret_state[i] = state[i];
+  }
+
+  // Copy out the per-round x^3 values (no null check: sbox must be valid).
+  for (size_t r = 0; r < NUM_INTERNAL_ROUNDS; r++) {
+    sbox[r] = sbox_deg_3[r];
+  }
+}
+// Runs the permutation on `input`, recording the intermediate round states.
+template <class F>
+__SP1_HOSTDEV__ __SP1_INLINE__ void populate_perm(
+    const F input[WIDTH], F external_rounds_state[WIDTH * NUM_EXTERNAL_ROUNDS],
+    F internal_rounds_state[WIDTH],
+    F internal_rounds_s0[NUM_INTERNAL_ROUNDS - 1],
+    F external_sbox[WIDTH * NUM_EXTERNAL_ROUNDS],
+    F internal_sbox[NUM_INTERNAL_ROUNDS], F output_state[WIDTH]) {
+  for (size_t i = 0; i < WIDTH; i++) {
+    external_rounds_state[i] = input[i];
+  }
+  // First-half external rounds; the last result seeds the internal rounds.
+  for (size_t r = 0; r < NUM_EXTERNAL_ROUNDS / 2; r++) {
+    F next_state[WIDTH];
+    populate_external_round<F>(external_rounds_state, external_sbox, r,
+                               next_state);
+    if (r == NUM_EXTERNAL_ROUNDS / 2 - 1) {
+      for (size_t i = 0; i < WIDTH; i++) {
+        internal_rounds_state[i] = next_state[i];
+      }
+    } else {
+      for (size_t i = 0; i < WIDTH; i++) {
+        external_rounds_state[(r + 1) * WIDTH + i] = next_state[i];
+      }
+    }
+  }
+  // The internal rounds' result becomes the state for external round `row`.
+  F ret_state[WIDTH];
+  populate_internal_rounds<F>(internal_rounds_state, internal_rounds_s0,
+                              internal_sbox, ret_state);
+  size_t row = NUM_EXTERNAL_ROUNDS / 2;
+  for (size_t i = 0; i < WIDTH; i++) {
+    external_rounds_state[row * WIDTH + i] = ret_state[i];
+  }
+  // Second-half external rounds; the last result is written to output_state.
+  for (size_t r = NUM_EXTERNAL_ROUNDS / 2; r < NUM_EXTERNAL_ROUNDS; r++) {
+    F next_state[WIDTH];
+    populate_external_round<F>(external_rounds_state, external_sbox, r,
+                               next_state);
+    if (r == NUM_EXTERNAL_ROUNDS - 1) {
+      for (size_t i = 0; i < WIDTH; i++) {
+        output_state[i] = next_state[i];
+      }
+    } else {
+      for (size_t i = 0; i < WIDTH; i++) {
+        external_rounds_state[(r + 1) * WIDTH + i] = next_state[i];
+      }
+    }
+  }
+}
+// Runs the permutation on `input` and scatters the columns into `input_row`.
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const F input[WIDTH], F* input_row,
+                                  size_t start, size_t stride,
+                                  bool sbox_state) {
+  F external_rounds_state[WIDTH * NUM_EXTERNAL_ROUNDS];
+  F internal_rounds_state[WIDTH];
+  F internal_rounds_s0[NUM_INTERNAL_ROUNDS - 1];
+  F output_state[WIDTH];
+  F external_sbox[WIDTH * NUM_EXTERNAL_ROUNDS];
+  F internal_sbox[NUM_INTERNAL_ROUNDS];
+
+  populate_perm<F>(input, external_rounds_state, internal_rounds_state,
+                   internal_rounds_s0, external_sbox, internal_sbox,
+                   output_state);
+  // Column c lands at input_row[start + c * stride]; cursor is the column base.
+  size_t cursor = 0;
+  for (size_t i = 0; i < (WIDTH * NUM_EXTERNAL_ROUNDS); i++) {
+    input_row[start + (cursor + i) * stride] = external_rounds_state[i];
+  }
+
+  cursor += WIDTH * NUM_EXTERNAL_ROUNDS;
+  for (size_t i = 0; i < WIDTH; i++) {
+    input_row[start + (cursor + i) * stride] = internal_rounds_state[i];
+  }
+
+  cursor += WIDTH;
+  for (size_t i = 0; i < (NUM_INTERNAL_ROUNDS - 1); i++) {
+    input_row[start + (cursor + i) * stride] = internal_rounds_s0[i];
+  }
+
+  cursor += NUM_INTERNAL_ROUNDS - 1;
+  for (size_t i = 0; i < WIDTH; i++) {
+    input_row[start + (cursor + i) * stride] = output_state[i];
+  }
+  // The sbox columns are appended only when sbox_state is true.
+  if (sbox_state) {
+    cursor += WIDTH;
+    for (size_t i = 0; i < (WIDTH * NUM_EXTERNAL_ROUNDS); i++) {
+      input_row[start + (cursor + i) * stride] = external_sbox[i];
+    }
+
+    cursor += WIDTH * NUM_EXTERNAL_ROUNDS;
+    for (size_t i = 0; i < NUM_INTERNAL_ROUNDS; i++) {
+      input_row[start + (cursor + i) * stride] = internal_sbox[i];
+    }
+  }
+}
+// Fills the wide chip's preprocessed columns from one instruction.
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const Poseidon2SkinnyInstr<F>& instr,
+                                  Poseidon2PreprocessedColsWide<F>& cols) {
+  for (size_t i = 0; i < WIDTH; i++) {
+    cols.input[i] = instr.addrs.input[i];
+    cols.output[i] = MemoryAccessColsChips<F>{.addr = instr.addrs.output[i],
+                                              .mult = instr.mults[i]};
+  }
+  cols.is_real_neg = F::zero() - F::one();  // -1 in F
+}
+}  // namespace sp1_recursion_core_sys::poseidon2_wide
\ No newline at end of file
diff --git a/crates/recursion/core/include/prelude.hpp b/crates/recursion/core/include/prelude.hpp
new file mode 100644
index 0000000000..38cd08f262
--- /dev/null
+++ b/crates/recursion/core/include/prelude.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "sp1-recursion-core-sys-cbindgen.hpp"
+// No __CUDACC__: host-only qualifiers and array_t = std::array.
+#ifndef __CUDACC__
+#define __SP1_HOSTDEV__
+#define __SP1_INLINE__ inline
+#include <array>
+
+namespace sp1_recursion_core_sys {
+template <class T, std::size_t N>
+using array_t = std::array<T, N>;
+}  // namespace sp1_recursion_core_sys
+#else  // __CUDACC__ defined: host+device qualifiers, cuda::std::array
+#define __SP1_HOSTDEV__ __host__ __device__
+#define __SP1_INLINE__ __forceinline__
+#include <cuda/std/array>
+
+namespace sp1_recursion_core_sys {
+template <class T, std::size_t N>
+using array_t = cuda::std::array<T, N>;
+}  // namespace sp1_recursion_core_sys
+#endif  // __CUDACC__
diff --git a/crates/recursion/core/include/public_values.hpp b/crates/recursion/core/include/public_values.hpp
new file mode 100644
index 0000000000..a4a527086a
--- /dev/null
+++ b/crates/recursion/core/include/public_values.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::public_values {
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const CommitPublicValuesEvent<F>& event,
+                                  size_t digest_idx,
+                                  PublicValuesCols<F>& cols) {
+  cols.pv_element = event.public_values.digest[digest_idx];  // one digest word
+}
+// Sets the pv_idx flag for digest_idx and the matching memory access columns.
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const CommitPublicValuesInstr<F>& instr,
+                                  size_t digest_idx,
+                                  PublicValuesPreprocessedCols<F>& cols) {
+  cols.pv_idx[digest_idx] = F::one();  // other pv_idx entries are not written
+  cols.pv_mem.addr = instr.pv_addrs.digest[digest_idx];
+  cols.pv_mem.mult = F::zero() - F::one();  // -1 in F
+}
+}  // namespace sp1_recursion_core_sys::public_values
diff --git a/crates/recursion/core/include/select.hpp b/crates/recursion/core/include/select.hpp
new file mode 100644
index 0000000000..79285c0bb0
--- /dev/null
+++ b/crates/recursion/core/include/select.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "prelude.hpp"
+
+namespace sp1_recursion_core_sys::select {
+template <class F>
+__SP1_HOSTDEV__ void event_to_row(const SelectEvent<F>& event,
+                                  SelectCols<F>& cols) {
+  cols.vals = event;  // the row stores the event unchanged
+}
+// Marks the row as real and copies the instruction's addresses and mults.
+template <class F>
+__SP1_HOSTDEV__ void instr_to_row(const SelectInstr<F>& instr,
+                                  SelectPreprocessedCols<F>& cols) {
+  cols.is_real = F::one();
+  cols.addrs = instr.addrs;
+  cols.mult1 = instr.mult1;
+  cols.mult2 = instr.mult2;
+}
+}  // namespace sp1_recursion_core_sys::select
diff --git a/crates/recursion/core/include/sys.hpp b/crates/recursion/core/include/sys.hpp
new file mode 100644
index 0000000000..8e96d4227c
--- /dev/null
+++ b/crates/recursion/core/include/sys.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "alu_base.hpp"
+#include "alu_ext.hpp"
+#include "batch_fri.hpp"
+#include "exp_reverse_bits.hpp"
+#include "fri_fold.hpp"
+#include "public_values.hpp"
+#include "select.hpp"
+#include "poseidon2_constants.hpp"
+#include "poseidon2.hpp"
+#include "poseidon2_skinny.hpp"
+#include "poseidon2_wide.hpp"
+#include "sp1-recursion-core-sys-cbindgen.hpp"
diff --git a/crates/recursion/core/src/air/public_values.rs b/crates/recursion/core/src/air/public_values.rs
index fd5baed415..f5c8b86ee9 100644
--- a/crates/recursion/core/src/air/public_values.rs
+++ b/crates/recursion/core/src/air/public_values.rs
@@ -7,7 +7,7 @@ use p3_symmetric::CryptographicPermutation;
 use serde::{Deserialize, Serialize};
 use sp1_core_machine::utils::indices_arr;
 use sp1_derive::AlignedBorrow;
-use sp1_stark::{air::POSEIDON_NUM_WORDS, Word, PROOF_MAX_NUM_PVS};
+use sp1_stark::{air::POSEIDON_NUM_WORDS, septic_digest::SepticDigest, Word, PROOF_MAX_NUM_PVS};
 use static_assertions::const_assert_eq;
 use std::{
     borrow::BorrowMut,
@@ -113,12 +113,6 @@ pub struct RecursionPublicValues<T> {
     /// Last MemoryFinalize address bits.
     pub last_finalize_addr_bits: [T; 32],
 
-    /// Start state of reconstruct_challenger.
-    pub start_reconstruct_challenger: ChallengerPublicValues<T>,
-
-    /// End state of reconstruct_challenger.
-    pub end_reconstruct_challenger: ChallengerPublicValues<T>,
-
     /// Start state of reconstruct_deferred_digest.
     pub start_reconstruct_deferred_digest: [T; POSEIDON_NUM_WORDS],
 
@@ -131,12 +125,9 @@ pub struct RecursionPublicValues<T> {
     /// The root of the vk merkle tree.
     pub vk_root: [T; DIGEST_SIZE],
 
-    /// The leaf challenger containing the entropy from the main trace commitment.
-    pub leaf_challenger: ChallengerPublicValues<T>,
-
-    /// Current cumulative sum of lookup bus.  Note that for recursive proofs for core proofs, this
-    /// contains the global cumulative sum.  For all other proofs, it's the local cumulative sum.
-    pub cumulative_sum: [T; 4],
+    /// Current cumulative sum of the lookup bus. Note that for recursive proofs of core proofs,
+    /// this contains the global cumulative sum.
+    pub global_cumulative_sum: SepticDigest<T>,
 
     /// Whether the proof completely proves the program execution.
     pub is_complete: T,
diff --git a/crates/recursion/core/src/chips/alu_base.rs b/crates/recursion/core/src/chips/alu_base.rs
index f587c73ce8..b4fe17c467 100644
--- a/crates/recursion/core/src/chips/alu_base.rs
+++ b/crates/recursion/core/src/chips/alu_base.rs
@@ -253,4 +253,146 @@ mod tests {
 
         run_recursion_test_machines(program);
     }
+    // With the `sys` feature: the FFI-built main trace must equal the Rust-built one.
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+
+        let shard = ExecutionRecord {
+            base_alu_events: vec![BaseAluIo { out: F::one(), in1: F::one(), in2: F::one() }],
+            ..Default::default()
+        };
+
+        let chip = BaseAluChip;
+        let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+
+        assert_eq!(trace_ffi, trace);
+    }
+    // Test helper: rebuilds the main trace, populating each event through `crate::sys`.
+    #[cfg(feature = "sys")]
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        let events = &input.base_alu_events;
+        let nb_rows = events.len().div_ceil(NUM_BASE_ALU_ENTRIES_PER_ROW);
+        let fixed_log2_rows = input.fixed_log2_rows(&BaseAluChip);
+        let padded_nb_rows = match fixed_log2_rows {
+            Some(log2_rows) => 1 << log2_rows,
+            None => next_power_of_two(nb_rows, None),
+        };
+        let mut values = vec![F::zero(); padded_nb_rows * NUM_BASE_ALU_COLS];
+        // Events per parallel chunk: `len / num_cpus`, but never zero.
+        let chunk_size = std::cmp::max(events.len() / num_cpus::get(), 1);
+        let populate_len = events.len() * NUM_BASE_ALU_VALUE_COLS;
+        // Only the first `populate_len` cells are written; the padding stays zero.
+        values[..populate_len]
+            .par_chunks_mut(chunk_size * NUM_BASE_ALU_VALUE_COLS)
+            .enumerate()
+            .for_each(|(i, rows)| {
+                rows.chunks_mut(NUM_BASE_ALU_VALUE_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = i * chunk_size + j; // global event index
+                    if idx < events.len() {
+                        let cols: &mut BaseAluValueCols<_> = row.borrow_mut();
+                        unsafe {
+                            crate::sys::alu_base_event_to_row_babybear(&events[idx], cols);
+                        }
+                    }
+                });
+            });
+
+        RowMajorMatrix::new(values, NUM_BASE_ALU_COLS)
+    }
+    // Smoke test: builds the preprocessed trace for one `AddF` instruction and prints it.
+    #[test]
+    fn generate_preprocessed_trace() {
+        type F = BabyBear;
+
+        let program = RecursionProgram {
+            instructions: vec![Instruction::BaseAlu(BaseAluInstr {
+                opcode: BaseAluOpcode::AddF,
+                mult: F::one(),
+                addrs: BaseAluIo {
+                    out: Address(F::zero()),
+                    in1: Address(F::one()),
+                    in2: Address(F::two()),
+                },
+            })],
+            ..Default::default()
+        };
+
+        let chip = BaseAluChip;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        println!("{:?}", trace.values);
+    }
+    // With the `sys` feature: the FFI-built preprocessed trace must equal the Rust-built one.
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        type F = BabyBear;
+
+        let program = RecursionProgram {
+            instructions: vec![Instruction::BaseAlu(BaseAluInstr {
+                opcode: BaseAluOpcode::AddF,
+                mult: F::one(),
+                addrs: BaseAluIo {
+                    out: Address(F::zero()),
+                    in1: Address(F::one()),
+                    in2: Address(F::two()),
+                },
+            })],
+            ..Default::default()
+        };
+
+        let chip = BaseAluChip;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+
+        assert_eq!(trace_ffi, trace);
+    }
+    // Test helper: rebuilds the preprocessed trace, one `crate::sys` call per BaseAlu instr.
+    #[cfg(feature = "sys")]
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        let instrs = program
+            .instructions
+            .iter()
+            .filter_map(|instruction| match instruction {
+                Instruction::BaseAlu(x) => Some(x),
+                _ => None,
+            })
+            .collect::<Vec<_>>();
+
+        let nb_rows = instrs.len().div_ceil(NUM_BASE_ALU_ENTRIES_PER_ROW);
+        let fixed_log2_rows = program.fixed_log2_rows(&BaseAluChip);
+        let padded_nb_rows = match fixed_log2_rows {
+            Some(log2_rows) => 1 << log2_rows,
+            None => next_power_of_two(nb_rows, None),
+        };
+        let mut values = vec![F::zero(); padded_nb_rows * NUM_BASE_ALU_PREPROCESSED_COLS];
+        // Instructions per parallel chunk: `len / num_cpus`, but never zero.
+        let chunk_size = std::cmp::max(instrs.len() / num_cpus::get(), 1);
+        let populate_len = instrs.len() * NUM_BASE_ALU_ACCESS_COLS;
+        // Only the first `populate_len` cells are written; the padding stays zero.
+        values[..populate_len]
+            .par_chunks_mut(chunk_size * NUM_BASE_ALU_ACCESS_COLS)
+            .enumerate()
+            .for_each(|(i, rows)| {
+                rows.chunks_mut(NUM_BASE_ALU_ACCESS_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = i * chunk_size + j; // global instruction index
+                    if idx < instrs.len() {
+                        let access: &mut BaseAluAccessCols<_> = row.borrow_mut();
+                        unsafe {
+                            crate::sys::alu_base_instr_to_row_babybear(instrs[idx], access);
+                        }
+                    }
+                });
+            });
+
+        RowMajorMatrix::new(values, NUM_BASE_ALU_PREPROCESSED_COLS)
+    }
 }
diff --git a/crates/recursion/core/src/chips/alu_ext.rs b/crates/recursion/core/src/chips/alu_ext.rs
index b698a5d209..4dbe0ca605 100644
--- a/crates/recursion/core/src/chips/alu_ext.rs
+++ b/crates/recursion/core/src/chips/alu_ext.rs
@@ -265,4 +265,149 @@ mod tests {
 
         run_recursion_test_machines(program);
     }
+    // With the `sys` feature: the FFI-built main trace must equal the Rust-built one.
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+
+        let shard = ExecutionRecord {
+            ext_alu_events: vec![ExtAluIo {
+                out: F::one().into(),
+                in1: F::one().into(),
+                in2: F::one().into(),
+            }],
+            ..Default::default()
+        };
+
+        let chip = ExtAluChip;
+        let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+
+        assert_eq!(trace_ffi, trace);
+    }
+    // Test helper: rebuilds the main trace, populating each event through `crate::sys`.
+    #[cfg(feature = "sys")]
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        let events = &input.ext_alu_events;
+        let nb_rows = events.len().div_ceil(NUM_EXT_ALU_ENTRIES_PER_ROW);
+        let fixed_log2_rows = input.fixed_log2_rows(&ExtAluChip);
+        let padded_nb_rows = match fixed_log2_rows {
+            Some(log2_rows) => 1 << log2_rows,
+            None => next_power_of_two(nb_rows, None),
+        };
+        let mut values = vec![F::zero(); padded_nb_rows * NUM_EXT_ALU_COLS];
+        // Events per parallel chunk: `len / num_cpus`, but never zero.
+        let chunk_size = std::cmp::max(events.len() / num_cpus::get(), 1);
+        let populate_len = events.len() * NUM_EXT_ALU_VALUE_COLS;
+        // Only the first `populate_len` cells are written; the padding stays zero.
+        values[..populate_len]
+            .par_chunks_mut(chunk_size * NUM_EXT_ALU_VALUE_COLS)
+            .enumerate()
+            .for_each(|(i, rows)| {
+                rows.chunks_mut(NUM_EXT_ALU_VALUE_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = i * chunk_size + j; // global event index
+                    if idx < events.len() {
+                        let cols: &mut ExtAluValueCols<_> = row.borrow_mut();
+                        unsafe {
+                            crate::sys::alu_ext_event_to_row_babybear(&events[idx], cols);
+                        }
+                    }
+                });
+            });
+
+        RowMajorMatrix::new(values, NUM_EXT_ALU_COLS)
+    }
+    // Smoke test: builds the preprocessed trace for one `AddE` instruction and prints it.
+    #[test]
+    fn generate_preprocessed_trace() {
+        type F = BabyBear;
+
+        let program = RecursionProgram {
+            instructions: vec![Instruction::ExtAlu(ExtAluInstr {
+                opcode: ExtAluOpcode::AddE,
+                mult: F::one(),
+                addrs: ExtAluIo {
+                    out: Address(F::zero()),
+                    in1: Address(F::one()),
+                    in2: Address(F::two()),
+                },
+            })],
+            ..Default::default()
+        };
+        let chip = ExtAluChip;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        println!("{:?}", trace.values);
+    }
+    // With the `sys` feature: the FFI-built preprocessed trace must equal the Rust-built one.
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        type F = BabyBear;
+
+        let program = RecursionProgram {
+            instructions: vec![Instruction::ExtAlu(ExtAluInstr {
+                opcode: ExtAluOpcode::AddE,
+                mult: F::one(),
+                addrs: ExtAluIo {
+                    out: Address(F::zero()),
+                    in1: Address(F::one()),
+                    in2: Address(F::two()),
+                },
+            })],
+            ..Default::default()
+        };
+
+        let chip = ExtAluChip;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+
+        assert_eq!(trace_ffi, trace);
+    }
+    // Test helper: rebuilds the preprocessed trace, one `crate::sys` call per ExtAlu instr.
+    #[cfg(feature = "sys")]
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        let instrs = program
+            .instructions
+            .iter()
+            .filter_map(|instruction| match instruction {
+                Instruction::ExtAlu(x) => Some(x),
+                _ => None,
+            })
+            .collect::<Vec<_>>();
+
+        let nb_rows = instrs.len().div_ceil(NUM_EXT_ALU_ENTRIES_PER_ROW);
+        let fixed_log2_rows = program.fixed_log2_rows(&ExtAluChip);
+        let padded_nb_rows = match fixed_log2_rows {
+            Some(log2_rows) => 1 << log2_rows,
+            None => next_power_of_two(nb_rows, None),
+        };
+        let mut values = vec![F::zero(); padded_nb_rows * NUM_EXT_ALU_PREPROCESSED_COLS];
+        // Instructions per parallel chunk: `len / num_cpus`, but never zero.
+        let chunk_size = std::cmp::max(instrs.len() / num_cpus::get(), 1);
+        let populate_len = instrs.len() * NUM_EXT_ALU_ACCESS_COLS;
+        // Only the first `populate_len` cells are written; the padding stays zero.
+        values[..populate_len]
+            .par_chunks_mut(chunk_size * NUM_EXT_ALU_ACCESS_COLS)
+            .enumerate()
+            .for_each(|(i, rows)| {
+                rows.chunks_mut(NUM_EXT_ALU_ACCESS_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = i * chunk_size + j; // global instruction index
+                    if idx < instrs.len() {
+                        let access: &mut ExtAluAccessCols<_> = row.borrow_mut();
+                        unsafe {
+                            crate::sys::alu_ext_instr_to_row_babybear(instrs[idx], access);
+                        }
+                    }
+                });
+            });
+
+        RowMajorMatrix::new(values, NUM_EXT_ALU_PREPROCESSED_COLS)
+    }
 }
diff --git a/crates/recursion/core/src/chips/batch_fri.rs b/crates/recursion/core/src/chips/batch_fri.rs
index 6522a9881d..db073f6479 100644
--- a/crates/recursion/core/src/chips/batch_fri.rs
+++ b/crates/recursion/core/src/chips/batch_fri.rs
@@ -72,6 +72,7 @@ impl<F: PrimeField32, const DEGREE: usize> MachineAir<F> for BatchFRIChip<DEGREE
     fn preprocessed_width(&self) -> usize {
         NUM_BATCH_FRI_PREPROCESSED_COLS
     }
+
     fn generate_preprocessed_trace(&self, program: &Self::Program) -> Option<RowMajorMatrix<F>> {
         let mut rows: Vec<[F; NUM_BATCH_FRI_PREPROCESSED_COLS]> = Vec::new();
         program
@@ -91,14 +92,14 @@ impl<F: PrimeField32, const DEGREE: usize> MachineAir<F> for BatchFRIChip<DEGREE
                 let mut row_add = vec![[F::zero(); NUM_BATCH_FRI_PREPROCESSED_COLS]; len];
                 debug_assert_eq!(*acc_mult, F::one());
 
-                row_add.iter_mut().enumerate().for_each(|(i, row)| {
+                row_add.iter_mut().enumerate().for_each(|(_i, row)| {
                     let row: &mut BatchFRIPreprocessedCols<F> = row.as_mut_slice().borrow_mut();
                     row.is_real = F::one();
-                    row.is_end = F::from_bool(i == len - 1);
+                    row.is_end = F::from_bool(_i == len - 1);
                     row.acc_addr = ext_single_addrs.acc;
-                    row.alpha_pow_addr = ext_vec_addrs.alpha_pow[i];
-                    row.p_at_z_addr = ext_vec_addrs.p_at_z[i];
-                    row.p_at_x_addr = base_vec_addrs.p_at_x[i];
+                    row.alpha_pow_addr = ext_vec_addrs.alpha_pow[_i];
+                    row.p_at_z_addr = ext_vec_addrs.p_at_z[_i];
+                    row.p_at_x_addr = base_vec_addrs.p_at_x[_i];
                 });
                 rows.extend(row_add);
             });
@@ -229,3 +230,168 @@ where
         self.eval_batch_fri::<AB>(builder, local, next, prepr_local, prepr_next);
     }
 }
+// BatchFRI chip tests; the `sys`-gated ones compare FFI-built traces with the Rust ones.
+#[cfg(test)]
+mod tests {
+    use crate::{BatchFRIBaseVecIo, BatchFRIEvent, BatchFRIExtSingleIo, BatchFRIExtVecIo};
+    use p3_baby_bear::BabyBear;
+    use p3_field::AbstractField;
+    use p3_matrix::dense::RowMajorMatrix;
+
+    use super::*;
+    // Smoke test: builds the main trace for a single event and prints its values.
+    #[test]
+    fn generate_trace() {
+        type F = BabyBear;
+
+        let shard = ExecutionRecord {
+            batch_fri_events: vec![BatchFRIEvent {
+                ext_single: BatchFRIExtSingleIo { acc: Block::default() },
+                ext_vec: BatchFRIExtVecIo { alpha_pow: Block::default(), p_at_z: Block::default() },
+                base_vec: BatchFRIBaseVecIo { p_at_x: F::one() },
+            }],
+            ..Default::default()
+        };
+        let chip = BatchFRIChip::<2>;
+        let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        println!("{:?}", trace.values)
+    }
+    // With the `sys` feature: the FFI-built main trace must equal the Rust-built one.
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+
+        let shard = ExecutionRecord {
+            batch_fri_events: vec![BatchFRIEvent {
+                ext_single: BatchFRIExtSingleIo { acc: Block::default() },
+                ext_vec: BatchFRIExtVecIo { alpha_pow: Block::default(), p_at_z: Block::default() },
+                base_vec: BatchFRIBaseVecIo { p_at_x: F::one() },
+            }],
+            ..Default::default()
+        };
+
+        let chip = BatchFRIChip::<2>;
+        let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+
+        assert_eq!(trace_ffi, trace);
+    }
+    // Test helper: one row per event, populated through `crate::sys`, then padded.
+    #[cfg(feature = "sys")]
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        let events = &input.batch_fri_events;
+        let mut rows = vec![[F::zero(); NUM_BATCH_FRI_COLS]; events.len()];
+        // Chunks are walked sequentially here (`chunks_mut`, not a parallel iterator).
+        let chunk_size = std::cmp::max(events.len() / num_cpus::get(), 1);
+        rows.chunks_mut(chunk_size).enumerate().for_each(|(i, chunk)| {
+            chunk.iter_mut().enumerate().for_each(|(j, row)| {
+                let idx = i * chunk_size + j; // global event index
+                if idx < events.len() {
+                    let cols: &mut BatchFRICols<F> = row.as_mut_slice().borrow_mut();
+                    unsafe {
+                        crate::sys::batch_fri_event_to_row_babybear(&events[idx], cols);
+                    }
+                }
+            });
+        });
+
+        pad_rows_fixed(
+            &mut rows,
+            || [F::zero(); NUM_BATCH_FRI_COLS],
+            input.fixed_log2_rows(&BatchFRIChip::<2>),
+        );
+
+        RowMajorMatrix::new(rows.into_iter().flatten().collect(), NUM_BATCH_FRI_COLS)
+    }
+    // Smoke test: builds the preprocessed trace for one instruction and prints its values.
+    #[test]
+    fn generate_preprocessed_trace() {
+        type F = BabyBear;
+
+        let program = RecursionProgram::<F> {
+            instructions: vec![Instruction::BatchFRI(Box::new(BatchFRIInstr {
+                base_vec_addrs: BatchFRIBaseVecIo { p_at_x: vec![Address(F::zero())] },
+                ext_single_addrs: BatchFRIExtSingleIo { acc: Address(F::zero()) },
+                ext_vec_addrs: BatchFRIExtVecIo {
+                    alpha_pow: vec![Address(F::zero())],
+                    p_at_z: vec![Address(F::zero())],
+                },
+                acc_mult: F::one(),
+            }))],
+            ..Default::default()
+        };
+
+        let chip = BatchFRIChip::<2>;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        println!("{:?}", trace.values);
+    }
+    // With the `sys` feature: the FFI-built preprocessed trace must equal the Rust-built one.
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        type F = BabyBear;
+
+        let program = RecursionProgram::<F> {
+            instructions: vec![Instruction::BatchFRI(Box::new(BatchFRIInstr {
+                base_vec_addrs: BatchFRIBaseVecIo { p_at_x: vec![Address(F::zero())] },
+                ext_single_addrs: BatchFRIExtSingleIo { acc: Address(F::zero()) },
+                ext_vec_addrs: BatchFRIExtVecIo {
+                    alpha_pow: vec![Address(F::zero())],
+                    p_at_z: vec![Address(F::zero())],
+                },
+                acc_mult: F::one(),
+            }))],
+            ..Default::default()
+        };
+
+        let chip = BatchFRIChip::<2>;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+
+        assert_eq!(trace_ffi, trace);
+    }
+    // Test helper: `p_at_z.len()` rows per instruction, populated through `crate::sys`.
+    #[cfg(feature = "sys")]
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        let instrs = program
+            .instructions
+            .iter()
+            .filter_map(|instruction| match instruction {
+                Instruction::BatchFRI(x) => Some(x),
+                _ => None,
+            })
+            .collect::<Vec<_>>();
+
+        let mut rows = Vec::new();
+        instrs.iter().for_each(|instruction| {
+            let BatchFRIInstr { base_vec_addrs: _, ext_single_addrs: _, ext_vec_addrs, acc_mult } =
+                instruction.as_ref();
+            let len = ext_vec_addrs.p_at_z.len();
+            let mut row_add = vec![[F::zero(); NUM_BATCH_FRI_PREPROCESSED_COLS]; len];
+            debug_assert_eq!(*acc_mult, F::one());
+            // NOTE(review): no row index is passed, so every row gets the same FFI call.
+            row_add.iter_mut().for_each(|row| {
+                let cols: &mut BatchFRIPreprocessedCols<F> = row.as_mut_slice().borrow_mut();
+                unsafe {
+                    crate::sys::batch_fri_instr_to_row_babybear(&instruction.into(), cols);
+                }
+            });
+            rows.extend(row_add);
+        });
+
+        pad_rows_fixed(
+            &mut rows,
+            || [F::zero(); NUM_BATCH_FRI_PREPROCESSED_COLS],
+            program.fixed_log2_rows(&BatchFRIChip::<2>),
+        );
+
+        RowMajorMatrix::new(rows.into_iter().flatten().collect(), NUM_BATCH_FRI_PREPROCESSED_COLS)
+    }
+}
diff --git a/crates/recursion/core/src/chips/exp_reverse_bits.rs b/crates/recursion/core/src/chips/exp_reverse_bits.rs
index b5a8655236..e5c772a019 100644
--- a/crates/recursion/core/src/chips/exp_reverse_bits.rs
+++ b/crates/recursion/core/src/chips/exp_reverse_bits.rs
@@ -16,7 +16,7 @@ use crate::{
     ExpReverseBitsInstr, Instruction,
 };
 
-use super::mem::MemoryAccessCols;
+use super::mem::{MemoryAccessCols, MemoryAccessColsChips};
 
 pub const NUM_EXP_REVERSE_BITS_LEN_COLS: usize = core::mem::size_of::<ExpReverseBitsLenCols<u8>>();
 pub const NUM_EXP_REVERSE_BITS_LEN_PREPROCESSED_COLS: usize =
@@ -28,9 +28,9 @@ pub struct ExpReverseBitsLenChip<const DEGREE: usize>;
 #[derive(AlignedBorrow, Clone, Copy, Debug)]
 #[repr(C)]
 pub struct ExpReverseBitsLenPreprocessedCols<T: Copy> {
-    pub x_mem: MemoryAccessCols<T>,
-    pub exponent_mem: MemoryAccessCols<T>,
-    pub result_mem: MemoryAccessCols<T>,
+    pub x_mem: MemoryAccessColsChips<T>,
+    pub exponent_mem: MemoryAccessColsChips<T>,
+    pub result_mem: MemoryAccessColsChips<T>,
     pub iteration_num: T,
     pub is_first: T,
     pub is_last: T,
@@ -312,9 +312,12 @@ mod tests {
         machine::tests::run_recursion_test_machines,
         runtime::{instruction as instr, ExecutionRecord},
         stark::BabyBearPoseidon2Outer,
-        ExpReverseBitsEvent, Instruction, MemAccessKind, RecursionProgram,
+        Address, ExpReverseBitsEvent, ExpReverseBitsIo, Instruction, MemAccessKind,
+        RecursionProgram,
     };
 
+    use super::*;
+
     #[test]
     fn prove_babybear_circuit_erbl() {
         setup_logger();
@@ -387,4 +390,185 @@ mod tests {
         let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
         println!("{:?}", trace.values)
     }
+
+    #[test]
+    fn generate_erbl_preprocessed_trace() {
+        type F = BabyBear;
+        // Program with a single ExpReverseBitsLen instruction carrying three exponent addresses.
+        let program = RecursionProgram {
+            instructions: vec![Instruction::ExpReverseBitsLen(ExpReverseBitsInstr {
+                addrs: ExpReverseBitsIo {
+                    base: Address(F::zero()),
+                    exp: vec![Address(F::one()), Address(F::zero()), Address(F::one())],
+                    result: Address(F::from_canonical_u32(4)),
+                },
+                mult: F::one(),
+            })],
+            ..Default::default()
+        };
+        // Smoke test only: the trace values are printed, not asserted.
+        let chip = ExpReverseBitsLenChip::<3>;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        println!("{:?}", trace.values);
+    }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // Shard with one event: base 2, exponent bits [0, 1, 1], result F::two().exp_u64(0b110).
+        let shard = ExecutionRecord {
+            exp_reverse_bits_len_events: vec![ExpReverseBitsEvent {
+                base: F::two(),
+                exp: vec![F::zero(), F::one(), F::one()],
+                result: F::two().exp_u64(0b110),
+            }],
+            ..Default::default()
+        };
+        // Generate the same trace through the chip and through the FFI-backed helper.
+        let chip = ExpReverseBitsLenChip::<3>;
+        let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+        // Both paths must produce identical matrices.
+        assert_eq!(trace_ffi, trace);
+    }
+
+    #[cfg(feature = "sys")]
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        // Test helper: rebuilds the trace with each row filled through the `sys` FFI binding.
+        let events = &input.exp_reverse_bits_len_events;
+        let mut overall_rows = Vec::new();
+        // chunk_size is at least 1; the chunks are visited sequentially here.
+        let chunk_size = std::cmp::max(events.len() / num_cpus::get(), 1);
+        events.chunks(chunk_size).for_each(|chunk| {
+            chunk.iter().for_each(|event| {
+                let mut rows =
+                    vec![vec![F::zero(); NUM_EXP_REVERSE_BITS_LEN_COLS]; event.exp.len()];
+                let mut accum = F::one();
+                // One row per exponent entry; `i` is the row index handed to the FFI call.
+                rows.iter_mut().enumerate().for_each(|(i, row)| {
+                    let cols: &mut ExpReverseBitsLenCols<F> = row.as_mut_slice().borrow_mut();
+                    unsafe {
+                        crate::sys::exp_reverse_bits_event_to_row_babybear(&event.into(), i, cols);
+                    }
+
+                    // Accumulate after the event is converted to a row
+                    let prev_accum = accum;
+                    accum = prev_accum * prev_accum * cols.multiplier;
+                    // Store the accumulator and the squares derived from it in the row.
+                    cols.accum = accum;
+                    cols.accum_squared = accum * accum;
+                    cols.prev_accum_squared = prev_accum * prev_accum;
+                    cols.prev_accum_squared_times_multiplier =
+                        cols.prev_accum_squared * cols.multiplier;
+                });
+
+                overall_rows.extend(rows);
+            });
+        });
+        // Pad with all-zero rows; the last argument is the shard's fixed_log2_rows for this chip.
+        pad_rows_fixed(
+            &mut overall_rows,
+            || [F::zero(); NUM_EXP_REVERSE_BITS_LEN_COLS].to_vec(),
+            input.fixed_log2_rows(&ExpReverseBitsLenChip::<3>),
+        );
+        // Flatten the rows into a matrix of width NUM_EXP_REVERSE_BITS_LEN_COLS.
+        RowMajorMatrix::new(
+            overall_rows.into_iter().flatten().collect(),
+            NUM_EXP_REVERSE_BITS_LEN_COLS,
+        )
+    }
+
+    #[test]
+    fn generate_preprocessed_trace() {
+        type F = BabyBear;
+        // Program with a single ExpReverseBitsLen instruction carrying two exponent addresses.
+        let program = RecursionProgram::<F> {
+            instructions: vec![Instruction::ExpReverseBitsLen(ExpReverseBitsInstr {
+                addrs: ExpReverseBitsIo {
+                    base: Address(F::zero()),
+                    exp: vec![Address(F::zero()), Address(F::one())],
+                    result: Address(F::zero()),
+                },
+                mult: F::one(),
+            })],
+            ..Default::default()
+        };
+        // Smoke test only: the trace values are printed, not asserted.
+        let chip = ExpReverseBitsLenChip::<3>;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        println!("{:?}", trace.values);
+    }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // Program with a single ExpReverseBitsLen instruction carrying two exponent addresses.
+        let program = RecursionProgram::<F> {
+            instructions: vec![Instruction::ExpReverseBitsLen(ExpReverseBitsInstr {
+                addrs: ExpReverseBitsIo {
+                    base: Address(F::zero()),
+                    exp: vec![Address(F::zero()), Address(F::one())],
+                    result: Address(F::zero()),
+                },
+                mult: F::one(),
+            })],
+            ..Default::default()
+        };
+        // Generate the preprocessed trace through the chip and through the FFI-backed helper.
+        let chip = ExpReverseBitsLenChip::<3>;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+        // Both paths must produce identical matrices.
+        assert_eq!(trace_ffi, trace);
+    }
+
+    #[cfg(feature = "sys")]
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        // Test helper: keep only the ExpReverseBitsLen instructions of the program.
+        let instrs = program
+            .instructions
+            .iter()
+            .filter_map(|instruction| match instruction {
+                Instruction::ExpReverseBitsLen(x) => Some(x),
+                _ => None,
+            })
+            .collect::<Vec<_>>();
+        // One preprocessed row per exponent address, each filled through the `sys` FFI binding.
+        let mut rows = Vec::new();
+        instrs.iter().for_each(|instruction| {
+            let len = instruction.addrs.exp.len();
+            let mut row_add = vec![[F::zero(); NUM_EXP_REVERSE_BITS_LEN_PREPROCESSED_COLS]; len];
+            // The FFI call receives the row index `i` and the total row count `len`.
+            row_add.iter_mut().enumerate().for_each(|(i, row)| {
+                let cols: &mut ExpReverseBitsLenPreprocessedCols<F> =
+                    row.as_mut_slice().borrow_mut();
+                unsafe {
+                    crate::sys::exp_reverse_bits_instr_to_row_babybear(
+                        &(*instruction).into(),
+                        i,
+                        len,
+                        cols,
+                    );
+                }
+            });
+            rows.extend(row_add);
+        });
+        // Pad with all-zero rows; the last argument is the program's fixed_log2_rows.
+        pad_rows_fixed(
+            &mut rows,
+            || [F::zero(); NUM_EXP_REVERSE_BITS_LEN_PREPROCESSED_COLS],
+            program.fixed_log2_rows(&ExpReverseBitsLenChip::<3>),
+        );
+
+        RowMajorMatrix::new(
+            rows.into_iter().flatten().collect(),
+            NUM_EXP_REVERSE_BITS_LEN_PREPROCESSED_COLS,
+        )
+    }
 }
diff --git a/crates/recursion/core/src/chips/fri_fold.rs b/crates/recursion/core/src/chips/fri_fold.rs
index 063037032c..41167c4a40 100644
--- a/crates/recursion/core/src/chips/fri_fold.rs
+++ b/crates/recursion/core/src/chips/fri_fold.rs
@@ -21,7 +21,7 @@ use crate::{
     ExecutionRecord, FriFoldInstr,
 };
 
-use super::mem::MemoryAccessCols;
+use super::mem::{MemoryAccessCols, MemoryAccessColsChips};
 
 pub const NUM_FRI_FOLD_COLS: usize = core::mem::size_of::<FriFoldCols<u8>>();
 pub const NUM_FRI_FOLD_PREPROCESSED_COLS: usize =
@@ -45,19 +45,19 @@ pub struct FriFoldPreprocessedCols<T: Copy> {
     pub is_first: T,
 
     // Memory accesses for the single fields.
-    pub z_mem: MemoryAccessCols<T>,
-    pub alpha_mem: MemoryAccessCols<T>,
-    pub x_mem: MemoryAccessCols<T>,
+    pub z_mem: MemoryAccessColsChips<T>,
+    pub alpha_mem: MemoryAccessColsChips<T>,
+    pub x_mem: MemoryAccessColsChips<T>,
 
     // Memory accesses for the vector field inputs.
-    pub alpha_pow_input_mem: MemoryAccessCols<T>,
-    pub ro_input_mem: MemoryAccessCols<T>,
-    pub p_at_x_mem: MemoryAccessCols<T>,
-    pub p_at_z_mem: MemoryAccessCols<T>,
+    pub alpha_pow_input_mem: MemoryAccessColsChips<T>,
+    pub ro_input_mem: MemoryAccessColsChips<T>,
+    pub p_at_x_mem: MemoryAccessColsChips<T>,
+    pub p_at_z_mem: MemoryAccessColsChips<T>,
 
     // Memory accesses for the vector field outputs.
-    pub ro_output_mem: MemoryAccessCols<T>,
-    pub alpha_pow_output_mem: MemoryAccessCols<T>,
+    pub ro_output_mem: MemoryAccessColsChips<T>,
+    pub alpha_pow_output_mem: MemoryAccessColsChips<T>,
 
     pub is_real: T,
 }
@@ -100,6 +100,7 @@ impl<F: PrimeField32, const DEGREE: usize> MachineAir<F> for FriFoldChip<DEGREE>
     fn preprocessed_width(&self) -> usize {
         NUM_FRI_FOLD_PREPROCESSED_COLS
     }
+
     fn generate_preprocessed_trace(&self, program: &Self::Program) -> Option<RowMajorMatrix<F>> {
         let mut rows: Vec<[F; NUM_FRI_FOLD_PREPROCESSED_COLS]> = Vec::new();
         program
@@ -362,13 +363,15 @@ mod tests {
     use p3_field::AbstractField;
     use p3_matrix::dense::RowMajorMatrix;
 
+    use super::*;
+
     use crate::{
         air::Block,
         chips::fri_fold::FriFoldChip,
         machine::tests::run_recursion_test_machines,
         runtime::{instruction as instr, ExecutionRecord},
         stark::BabyBearPoseidon2Outer,
-        FriFoldBaseIo, FriFoldEvent, FriFoldExtSingleIo, FriFoldExtVecIo, Instruction,
+        Address, FriFoldBaseIo, FriFoldEvent, FriFoldExtSingleIo, FriFoldExtVecIo, Instruction,
         MemAccessKind, RecursionProgram,
     };
 
@@ -545,4 +548,191 @@ mod tests {
         let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
         println!("{:?}", trace.values)
     }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // Fixed seeds keep the inputs reproducible; `rng` is moved into the closures below.
+        let mut rng = StdRng::seed_from_u64(0xDEADBEEF);
+        let mut rng2 = StdRng::seed_from_u64(0xDEADBEEF);
+        let mut random_felt = move || -> F { F::from_canonical_u32(rng.gen_range(0..1 << 16)) };
+        let mut random_block = move || Block::from([random_felt(); 4]);
+        // NOTE: `[random_felt(); 4]` evaluates random_felt once, so a block repeats one value.
+        let shard = ExecutionRecord {
+            fri_fold_events: (0..17)
+                .map(|_| FriFoldEvent {
+                    base_single: FriFoldBaseIo {
+                        x: F::from_canonical_u32(rng2.gen_range(0..1 << 16)),
+                    },
+                    ext_single: FriFoldExtSingleIo { z: random_block(), alpha: random_block() },
+                    ext_vec: crate::FriFoldExtVecIo {
+                        mat_opening: random_block(),
+                        ps_at_z: random_block(),
+                        alpha_pow_input: random_block(),
+                        ro_input: random_block(),
+                        alpha_pow_output: random_block(),
+                        ro_output: random_block(),
+                    },
+                })
+                .collect(),
+            ..Default::default()
+        };
+        // Generate the same trace through the chip and through the FFI-backed helper.
+        let chip = FriFoldChip::<3>::default();
+        let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+        // Both paths must produce identical matrices.
+        assert_eq!(trace_ffi, trace);
+    }
+
+    #[cfg(feature = "sys")]
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        // Test helper: one zero-initialised row is allocated per FriFold event.
+        let events = &input.fri_fold_events;
+        let mut rows = events.iter().map(|_| [F::zero(); NUM_FRI_FOLD_COLS]).collect_vec();
+        // `idx` recovers the event index from the chunk index `i` and the in-chunk offset `j`.
+        let chunk_size = std::cmp::max(events.len() / num_cpus::get(), 1);
+        rows.chunks_mut(chunk_size).enumerate().for_each(|(i, chunk)| {
+            chunk.iter_mut().enumerate().for_each(|(j, row)| {
+                let idx = i * chunk_size + j;
+                if idx < events.len() {
+                    let cols: &mut FriFoldCols<F> = row.as_mut_slice().borrow_mut();
+                    unsafe {
+                        crate::sys::fri_fold_event_to_row_babybear(&events[idx], cols);
+                    }
+                }
+            });
+        });
+        // Pad with all-zero rows; the last argument is the shard's fixed_log2_rows for this chip.
+        pad_rows_fixed(
+            &mut rows,
+            || [F::zero(); NUM_FRI_FOLD_COLS],
+            input.fixed_log2_rows(&FriFoldChip::<3>::default()),
+        );
+
+        RowMajorMatrix::new(rows.into_iter().flatten().collect(), NUM_FRI_FOLD_COLS)
+    }
+
+    #[test]
+    fn generate_preprocessed_trace() {
+        type F = BabyBear;
+        // Seeded generator so the random addresses are reproducible.
+        let mut rng = StdRng::seed_from_u64(0xDEADBEEF);
+        let mut random_addr = move || -> F { F::from_canonical_u32(rng.gen_range(0..1 << 16)) };
+
+        // Create a program with 17 FriFold instructions, each with single-element vector fields.
+        let program = RecursionProgram::<F> {
+            instructions: (0..17)
+                .map(|_| {
+                    Instruction::FriFold(Box::new(FriFoldInstr::<F> {
+                        base_single_addrs: FriFoldBaseIo { x: Address(random_addr()) },
+                        ext_single_addrs: FriFoldExtSingleIo {
+                            z: Address(random_addr()),
+                            alpha: Address(random_addr()),
+                        },
+                        ext_vec_addrs: FriFoldExtVecIo {
+                            mat_opening: vec![Address(random_addr())],
+                            ps_at_z: vec![Address(random_addr())],
+                            alpha_pow_input: vec![Address(random_addr())],
+                            ro_input: vec![Address(random_addr())],
+                            alpha_pow_output: vec![Address(random_addr())],
+                            ro_output: vec![Address(random_addr())],
+                        },
+                        alpha_pow_mults: vec![F::one()],
+                        ro_mults: vec![F::one()],
+                    }))
+                })
+                .collect(),
+            ..Default::default()
+        };
+        // Smoke test only: the trace values are printed, not asserted.
+        let chip = FriFoldChip::<3>::default();
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        println!("{:?}", trace.values);
+    }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // Seeded generator so the random addresses are reproducible.
+        let mut rng = StdRng::seed_from_u64(0xDEADBEEF);
+        let mut random_addr = move || -> F { F::from_canonical_u32(rng.gen_range(0..1 << 16)) };
+
+        // Create a program with 17 FriFold instructions, each with single-element vector fields.
+        let program = RecursionProgram::<F> {
+            instructions: (0..17)
+                .map(|_| {
+                    Instruction::FriFold(Box::new(FriFoldInstr::<F> {
+                        base_single_addrs: FriFoldBaseIo { x: Address(random_addr()) },
+                        ext_single_addrs: FriFoldExtSingleIo {
+                            z: Address(random_addr()),
+                            alpha: Address(random_addr()),
+                        },
+                        ext_vec_addrs: FriFoldExtVecIo {
+                            mat_opening: vec![Address(random_addr())],
+                            ps_at_z: vec![Address(random_addr())],
+                            alpha_pow_input: vec![Address(random_addr())],
+                            ro_input: vec![Address(random_addr())],
+                            alpha_pow_output: vec![Address(random_addr())],
+                            ro_output: vec![Address(random_addr())],
+                        },
+                        alpha_pow_mults: vec![F::one()],
+                        ro_mults: vec![F::one()],
+                    }))
+                })
+                .collect(),
+            ..Default::default()
+        };
+        // Generate the preprocessed trace through the chip and through the FFI-backed helper.
+        let chip = FriFoldChip::<3>::default();
+        let trace_rust = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+        // Both paths must produce identical matrices.
+        assert_eq!(trace_ffi, trace_rust);
+    }
+
+    #[cfg(feature = "sys")]
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        // Test helper: only FriFold instructions contribute preprocessed rows.
+        let mut rows = Vec::new();
+        program
+            .instructions
+            .iter()
+            .filter_map(|instruction| {
+                if let Instruction::FriFold(instr) = instruction {
+                    Some(instr)
+                } else {
+                    None
+                }
+            })
+            .for_each(|instruction| {
+                let mut row_add = vec![
+                    [F::zero(); NUM_FRI_FOLD_PREPROCESSED_COLS];
+                    instruction.ext_vec_addrs.ps_at_z.len()
+                ];
+                // One row per `ps_at_z` address, each filled through the `sys` FFI binding.
+                row_add.iter_mut().enumerate().for_each(|(row_idx, row)| {
+                    let cols: &mut FriFoldPreprocessedCols<F> = row.as_mut_slice().borrow_mut();
+                    unsafe {
+                        crate::sys::fri_fold_instr_to_row_babybear(
+                            &instruction.into(),
+                            row_idx,
+                            cols,
+                        );
+                    }
+                });
+
+                rows.extend(row_add);
+            });
+        // NOTE(review): pads with `None` instead of a fixed_log2_rows value - confirm intended.
+        pad_rows_fixed(&mut rows, || [F::zero(); NUM_FRI_FOLD_PREPROCESSED_COLS], None);
+
+        RowMajorMatrix::new(rows.into_iter().flatten().collect(), NUM_FRI_FOLD_PREPROCESSED_COLS)
+    }
 }
diff --git a/crates/recursion/core/src/chips/mem/mod.rs b/crates/recursion/core/src/chips/mem/mod.rs
index f318db027a..cace2a1026 100644
--- a/crates/recursion/core/src/chips/mem/mod.rs
+++ b/crates/recursion/core/src/chips/mem/mod.rs
@@ -13,10 +13,13 @@ pub const NUM_MEM_ACCESS_COLS: usize = core::mem::size_of::<MemoryAccessCols<u8>
 /// Data describing in what manner to access a particular memory block.
 #[derive(AlignedBorrow, Debug, Clone, Copy)]
 #[repr(C)]
-pub struct MemoryAccessCols<F: Copy> {
+pub struct MemoryAccessColsChips<F: Copy> {
     /// The address to access.
     pub addr: Address<F>,
     /// The multiplicity which to read/write.
     /// "Positive" values indicate a write, and "negative" values indicate a read.
     pub mult: F,
 }
+
+/// Avoids cbindgen naming collisions.
+pub type MemoryAccessCols<F> = MemoryAccessColsChips<F>;
diff --git a/crates/recursion/core/src/chips/mem/variable.rs b/crates/recursion/core/src/chips/mem/variable.rs
index 98ae999ba4..853984bbbf 100644
--- a/crates/recursion/core/src/chips/mem/variable.rs
+++ b/crates/recursion/core/src/chips/mem/variable.rs
@@ -1,5 +1,5 @@
 use core::borrow::Borrow;
-use instruction::{HintBitsInstr, HintExt2FeltsInstr, HintInstr};
+use instruction::{HintAddCurveInstr, HintBitsInstr, HintExt2FeltsInstr, HintInstr};
 use p3_air::{Air, BaseAir, PairBuilder};
 use p3_field::PrimeField32;
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
@@ -70,6 +70,13 @@ impl<F: PrimeField32> MachineAir<F> for MemoryChip<F> {
                     output_addrs_mults,
                     input_addr: _, // No receive interaction for the hint operation
                 }) => output_addrs_mults.iter().collect(),
+                Instruction::HintAddCurve(instr) => {
+                    let HintAddCurveInstr {
+                        output_x_addrs_mults,
+                        output_y_addrs_mults, .. // No receive interaction for the hint operation
+                    } = instr.as_ref();
+                    output_x_addrs_mults.iter().chain(output_y_addrs_mults.iter()).collect()
+                }
                 _ => vec![],
             })
             .collect::<Vec<_>>();
diff --git a/crates/recursion/core/src/chips/poseidon2_skinny/columns/mod.rs b/crates/recursion/core/src/chips/poseidon2_skinny/columns/mod.rs
index 7338082179..a8080c7a35 100644
--- a/crates/recursion/core/src/chips/poseidon2_skinny/columns/mod.rs
+++ b/crates/recursion/core/src/chips/poseidon2_skinny/columns/mod.rs
@@ -14,10 +14,12 @@ const fn make_col_map_degree9() -> Poseidon2<usize> {
 }
 pub const POSEIDON2_DEGREE9_COL_MAP: Poseidon2<usize> = make_col_map_degree9();
 
+pub const NUM_INTERNAL_ROUNDS_S0: usize = NUM_INTERNAL_ROUNDS - 1;
+
 /// Struct for the poseidon2 skinny non preprocessed column.
 #[derive(AlignedBorrow, Clone, Copy)]
 #[repr(C)]
 pub struct Poseidon2<T: Copy> {
     pub state_var: [T; WIDTH],
-    pub internal_rounds_s0: [T; NUM_INTERNAL_ROUNDS - 1],
+    pub internal_rounds_s0: [T; NUM_INTERNAL_ROUNDS_S0],
 }
diff --git a/crates/recursion/core/src/chips/poseidon2_skinny/columns/preprocessed.rs b/crates/recursion/core/src/chips/poseidon2_skinny/columns/preprocessed.rs
index 02f5e041a8..90a857255b 100644
--- a/crates/recursion/core/src/chips/poseidon2_skinny/columns/preprocessed.rs
+++ b/crates/recursion/core/src/chips/poseidon2_skinny/columns/preprocessed.rs
@@ -1,6 +1,6 @@
 use sp1_derive::AlignedBorrow;
 
-use crate::chips::{mem::MemoryAccessCols, poseidon2_skinny::WIDTH};
+use crate::chips::{mem::MemoryAccessColsChips, poseidon2_skinny::WIDTH};
 
 #[derive(AlignedBorrow, Clone, Copy, Debug)]
 #[repr(C)]
@@ -13,7 +13,9 @@ pub struct RoundCountersPreprocessedCols<T: Copy> {
 
 #[derive(AlignedBorrow, Clone, Copy, Debug)]
 #[repr(C)]
-pub struct Poseidon2PreprocessedCols<T: Copy> {
-    pub memory_preprocessed: [MemoryAccessCols<T>; WIDTH],
+pub struct Poseidon2PreprocessedColsSkinny<T: Copy> {
+    pub memory_preprocessed: [MemoryAccessColsChips<T>; WIDTH],
     pub round_counters_preprocessed: RoundCountersPreprocessedCols<T>,
 }
+
+pub type Poseidon2PreprocessedCols<T> = Poseidon2PreprocessedColsSkinny<T>;
diff --git a/crates/recursion/core/src/chips/poseidon2_skinny/trace.rs b/crates/recursion/core/src/chips/poseidon2_skinny/trace.rs
index 7e67f54362..794fc00d79 100644
--- a/crates/recursion/core/src/chips/poseidon2_skinny/trace.rs
+++ b/crates/recursion/core/src/chips/poseidon2_skinny/trace.rs
@@ -30,7 +30,7 @@ const PREPROCESSED_POSEIDON2_WIDTH: usize = size_of::<Poseidon2PreprocessedCols<
 
 const INTERNAL_ROUND_IDX: usize = NUM_EXTERNAL_ROUNDS / 2 + 1;
 const INPUT_ROUND_IDX: usize = 0;
-const OUTPUT_ROUND_IDX: usize = NUM_EXTERNAL_ROUNDS + 2;
+pub const OUTPUT_ROUND_IDX: usize = NUM_EXTERNAL_ROUNDS + 2;
 
 impl<F: PrimeField32, const DEGREE: usize> MachineAir<F> for Poseidon2SkinnyChip<DEGREE> {
     type Record = ExecutionRecord<F>;
@@ -276,9 +276,11 @@ mod tests {
 
     use crate::{
         chips::poseidon2_skinny::{Poseidon2SkinnyChip, WIDTH},
-        ExecutionRecord, Poseidon2Event,
+        Address, ExecutionRecord, Poseidon2Event, Poseidon2Instr, Poseidon2Io,
     };
 
+    use super::*;
+
     #[test]
     fn generate_trace() {
         type F = BabyBear;
@@ -299,4 +301,146 @@ mod tests {
         let chip_9 = Poseidon2SkinnyChip::<9>::default();
         let _: RowMajorMatrix<F> = chip_9.generate_trace(&shard, &mut ExecutionRecord::default());
     }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        let input_0 = [F::one(); WIDTH];
+        let permuter = inner_perm();
+        let output_0 = permuter.permute(input_0);
+        let mut rng = rand::thread_rng();
+        // Sample every lane separately: `[F::rand(&mut rng); WIDTH]` would repeat one sample.
+        let input_1: [F; WIDTH] = core::array::from_fn(|_| F::rand(&mut rng));
+        let output_1 = permuter.permute(input_1);
+        let shard = ExecutionRecord {
+            poseidon2_events: vec![
+                Poseidon2Event { input: input_0, output: output_0 },
+                Poseidon2Event { input: input_1, output: output_1 },
+            ],
+            ..Default::default()
+        };
+        // Generate the same trace through the chip and through the FFI-backed helper.
+        let chip = Poseidon2SkinnyChip::<9>::default();
+        let trace_rust = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+        // Both paths must produce identical matrices.
+        assert_eq!(trace_ffi, trace_rust);
+    }
+
+    #[cfg(feature = "sys")]
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        let mut rows = Vec::new();
+        // Test helper: each event's block of rows is filled by a single `sys` FFI call.
+        for event in &input.poseidon2_events {
+            // We have one row for input, one row for output, NUM_EXTERNAL_ROUNDS rows for the
+            // external rounds, and one row for all internal rounds.
+            let mut row_add = [[F::zero(); NUM_POSEIDON2_COLS]; NUM_EXTERNAL_ROUNDS + 3];
+            let cols_ptr = row_add.as_mut_ptr() as *mut Poseidon2Cols<BabyBear>;
+            unsafe {
+                crate::sys::poseidon2_skinny_event_to_row_babybear(event, cols_ptr);
+            }
+            // Arrays are `IntoIterator` by value, so the block can be appended directly.
+            rows.extend(row_add);
+        }
+
+        // Pad the trace to a power of two.
+        pad_rows_fixed(
+            &mut rows,
+            || [F::zero(); NUM_POSEIDON2_COLS],
+            input.fixed_log2_rows(&Poseidon2SkinnyChip::<9>::default()),
+        );
+
+        // Convert the trace to a row major matrix.
+        RowMajorMatrix::new(rows.into_iter().flatten().collect(), NUM_POSEIDON2_COLS)
+    }
+
+    #[test]
+    fn generate_preprocessed_trace() {
+        type F = BabyBear;
+        // Single Poseidon2 instruction: every input address is 1, every output address is 2.
+        let program = RecursionProgram::<BabyBear> {
+            instructions: vec![Poseidon2(Box::new(Poseidon2Instr {
+                addrs: Poseidon2Io {
+                    input: [Address(F::one()); WIDTH],
+                    output: [Address(F::two()); WIDTH],
+                },
+                mults: [F::one(); WIDTH],
+            }))],
+            ..Default::default()
+        };
+        // Only checks that a preprocessed trace is produced.
+        let chip_9 = Poseidon2SkinnyChip::<9>::default();
+        let preprocessed: Option<RowMajorMatrix<F>> = chip_9.generate_preprocessed_trace(&program);
+        assert!(preprocessed.is_some());
+    }
+
+    // The FFI parity tests below are compiled only when the "sys" feature is enabled.
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // Single Poseidon2 instruction: every input address is 1, every output address is 2.
+        let program = RecursionProgram::<BabyBear> {
+            instructions: vec![Poseidon2(Box::new(Poseidon2Instr {
+                addrs: Poseidon2Io {
+                    input: [Address(F::one()); WIDTH],
+                    output: [Address(F::two()); WIDTH],
+                },
+                mults: [F::one(); WIDTH],
+            }))],
+            ..Default::default()
+        };
+        // Generate the preprocessed trace through the chip and through the FFI-backed helper.
+        let chip = Poseidon2SkinnyChip::<9>::default();
+        let trace_rust = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+        // Both paths must produce identical matrices.
+        assert_eq!(trace_ffi, trace_rust);
+    }
+
+    #[cfg(feature = "sys")]
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        // Test helper: lazily select the Poseidon2 instructions of the program.
+        let instructions =
+            program.instructions.iter().filter_map(|instruction| match instruction {
+                Poseidon2(instr) => Some(instr),
+                _ => None,
+            });
+        // The filter iterator is cloned so that counting does not consume it.
+        let num_instructions = instructions.clone().count();
+        // Allocate NUM_EXTERNAL_ROUNDS + 3 zeroed rows per instruction.
+        let mut rows = vec![
+            [F::zero(); PREPROCESSED_POSEIDON2_WIDTH];
+            num_instructions * (NUM_EXTERNAL_ROUNDS + 3)
+        ];
+        // Pair each instruction with its block of rows; `i` is the row index inside the block.
+        instructions.zip_eq(&rows.iter_mut().chunks(NUM_EXTERNAL_ROUNDS + 3)).for_each(
+            |(instruction, row_add)| {
+                row_add.into_iter().enumerate().for_each(|(i, row)| {
+                    let cols: &mut Poseidon2PreprocessedCols<_> =
+                        (*row).as_mut_slice().borrow_mut();
+                    unsafe {
+                        crate::sys::poseidon2_skinny_instr_to_row_babybear(instruction, i, cols);
+                    }
+                });
+            },
+        );
+        // Pad with all-zero rows; the last argument is the program's fixed_log2_rows.
+        pad_rows_fixed(
+            &mut rows,
+            || [F::zero(); PREPROCESSED_POSEIDON2_WIDTH],
+            program.fixed_log2_rows(&Poseidon2SkinnyChip::<9>::default()),
+        );
+
+        RowMajorMatrix::new(
+            rows.into_iter().flatten().collect::<Vec<_>>(),
+            PREPROCESSED_POSEIDON2_WIDTH,
+        )
+    }
 }
diff --git a/crates/recursion/core/src/chips/poseidon2_wide/columns/preprocessed.rs b/crates/recursion/core/src/chips/poseidon2_wide/columns/preprocessed.rs
index 41ec59cd76..47480ba4ef 100644
--- a/crates/recursion/core/src/chips/poseidon2_wide/columns/preprocessed.rs
+++ b/crates/recursion/core/src/chips/poseidon2_wide/columns/preprocessed.rs
@@ -1,14 +1,16 @@
 use sp1_derive::AlignedBorrow;
 
 use crate::{
-    chips::{mem::MemoryAccessCols, poseidon2_wide::WIDTH},
+    chips::{mem::MemoryAccessColsChips, poseidon2_wide::WIDTH},
     Address,
 };
 
 #[derive(AlignedBorrow, Clone, Copy, Debug)]
 #[repr(C)]
-pub struct Poseidon2PreprocessedCols<T: Copy> {
+pub struct Poseidon2PreprocessedColsWide<T: Copy> {
     pub input: [Address<T>; WIDTH],
-    pub output: [MemoryAccessCols<T>; WIDTH],
+    pub output: [MemoryAccessColsChips<T>; WIDTH],
     pub is_real_neg: T,
 }
+
+pub type Poseidon2PreprocessedCols<T> = Poseidon2PreprocessedColsWide<T>;
diff --git a/crates/recursion/core/src/chips/poseidon2_wide/trace.rs b/crates/recursion/core/src/chips/poseidon2_wide/trace.rs
index e13717bfae..90f8fb7d56 100644
--- a/crates/recursion/core/src/chips/poseidon2_wide/trace.rs
+++ b/crates/recursion/core/src/chips/poseidon2_wide/trace.rs
@@ -288,9 +288,11 @@ mod tests {
 
     use crate::{
         chips::poseidon2_wide::{Poseidon2WideChip, WIDTH},
-        ExecutionRecord, Poseidon2Event,
+        Address, ExecutionRecord, Poseidon2Event, Poseidon2Instr, Poseidon2Io,
     };
 
+    use super::*;
+
     #[test]
     fn generate_trace_deg_3() {
         type F = BabyBear;
@@ -334,4 +336,211 @@ mod tests {
         let chip_9 = Poseidon2WideChip::<9>;
         let _: RowMajorMatrix<F> = chip_9.generate_trace(&shard, &mut ExecutionRecord::default());
     }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        let input_0 = [F::one(); WIDTH];
+        let permuter = inner_perm();
+        let output_0 = permuter.permute(input_0);
+        let mut rng = rand::thread_rng();
+        // Sample each lane on its own: `[F::rand(&mut rng); WIDTH]` would repeat a single value.
+        let input_1: [F; WIDTH] = core::array::from_fn(|_| F::rand(&mut rng));
+        let output_1 = permuter.permute(input_1);
+
+        let shard = ExecutionRecord {
+            poseidon2_events: vec![
+                Poseidon2Event { input: input_0, output: output_0 },
+                Poseidon2Event { input: input_1, output: output_1 },
+            ],
+            ..Default::default()
+        };
+        // The FFI-generated trace must equal the trace produced by the Rust implementation.
+        let chip = Poseidon2WideChip::<9>;
+        let trace_rust = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+
+        assert_eq!(trace_ffi, trace_rust);
+    }
+
+    #[cfg(feature = "sys")] // Test helper: builds the wide Poseidon2 trace through the FFI.
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        let padded_nb_rows = match input.fixed_log2_rows(&Poseidon2WideChip::<9>) {
+            Some(log2_rows) => 1 << log2_rows,
+            None => next_power_of_two(input.poseidon2_events.len(), None),
+        };
+        let num_columns = <Poseidon2WideChip<9> as BaseAir<F>>::width(&Poseidon2WideChip::<9>);
+        let mut values = vec![F::zero(); padded_nb_rows * num_columns];
+        // Event rows come first (`values_pop`); all remaining rows (`values_dummy`) are padding.
+        let populate_len = input.poseidon2_events.len() * num_columns;
+        let (values_pop, values_dummy) = values.split_at_mut(populate_len);
+        // Fill event rows from their inputs and padding rows from the all-zero-input row.
+        join(
+            || {
+                values_pop
+                    .par_chunks_mut(num_columns)
+                    .zip_eq(&input.poseidon2_events)
+                    .for_each(|(row, event)| populate_perm_ffi::<9>(&event.input, row))
+            },
+            || {
+                let mut dummy_row = vec![F::zero(); num_columns];
+                populate_perm_ffi::<9>(&[F::zero(); WIDTH], &mut dummy_row);
+                values_dummy
+                    .par_chunks_mut(num_columns)
+                    .for_each(|row| row.copy_from_slice(&dummy_row))
+            },
+        );
+
+        RowMajorMatrix::new(values, num_columns)
+    }
+
+    #[cfg(feature = "sys")] // Test helper: fills one trace row through the FFI permutation.
+    fn populate_perm_ffi<const DEGREE: usize>(
+        input: &[BabyBear; WIDTH],
+        input_row: &mut [BabyBear],
+    ) {
+        let permutation = permutation_mut::<BabyBear, DEGREE>(input_row);
+        let (
+            external_rounds_state,
+            internal_rounds_state,
+            internal_rounds_s0,
+            mut external_sbox,
+            mut internal_sbox,
+            output_state,
+        ) = permutation.get_cols_mut();
+
+        // Stage the external-round state in a local array that is handed over by raw pointer.
+        let mut ext_rounds = [[BabyBear::zero(); WIDTH]; NUM_EXTERNAL_ROUNDS];
+        for (dst, src) in ext_rounds.iter_mut().zip(external_rounds_state.iter()) {
+            *dst = *src;
+        }
+
+        // Scratch for the external s-box, stored transposed; filled only if the column exists.
+        let mut ext_sbox = [[BabyBear::zero(); NUM_EXTERNAL_ROUNDS]; WIDTH];
+        if let Some(sbox) = external_sbox.as_mut() {
+            for i in 0..WIDTH {
+                for j in 0..NUM_EXTERNAL_ROUNDS {
+                    ext_sbox[i][j] = sbox[j][i];
+                }
+            }
+        }
+
+        // Scratch for the internal s-box; filled only if the column exists.
+        let mut int_sbox = [BabyBear::zero(); NUM_INTERNAL_ROUNDS];
+        if let Some(sbox) = internal_sbox.as_mut() {
+            int_sbox.copy_from_slice(sbox.as_slice());
+        }
+        // SAFETY: all arguments borrow live data (`input`, row columns, local scratch arrays).
+        // NOTE(review): assumes the C side uses the same array shapes — confirm in `sys`.
+        unsafe {
+            crate::sys::poseidon2_wide_event_to_row_babybear(
+                input,
+                ext_rounds.as_mut_ptr() as *mut _,
+                internal_rounds_state,
+                internal_rounds_s0,
+                if external_sbox.is_some() { &mut ext_sbox } else { std::ptr::null_mut() },
+                if internal_sbox.is_some() { &mut int_sbox } else { std::ptr::null_mut() },
+                output_state,
+            );
+        }
+
+        // Copy the staged external-round state back into the row.
+        for (dst, src) in external_rounds_state.iter_mut().zip(ext_rounds.iter()) {
+            *dst = *src;
+        }
+
+        // Copy the external s-box scratch back (undoing the transposition) if the column exists.
+        if let Some(sbox) = external_sbox.as_mut() {
+            for i in 0..WIDTH {
+                for j in 0..NUM_EXTERNAL_ROUNDS {
+                    sbox[j][i] = ext_sbox[i][j];
+                }
+            }
+        }
+
+        // Copy the internal s-box scratch back if the column exists.
+        if let Some(sbox) = internal_sbox.as_mut() {
+            sbox.copy_from_slice(&int_sbox);
+        }
+    }
+
+    #[test]
+    fn generate_preprocessed_trace() {
+        type F = BabyBear;
+        // Smoke test: a program with one Poseidon2 instruction yields a preprocessed trace.
+        let program = RecursionProgram::<BabyBear> {
+            instructions: vec![Poseidon2(Box::new(Poseidon2Instr {
+                addrs: Poseidon2Io {
+                    input: [Address(F::one()); WIDTH],
+                    output: [Address(F::two()); WIDTH],
+                },
+                mults: [F::one(); WIDTH],
+            }))],
+            ..Default::default()
+        };
+
+        let chip_9 = Poseidon2WideChip::<9>;
+        let preprocessed: Option<RowMajorMatrix<F>> = chip_9.generate_preprocessed_trace(&program);
+        assert!(preprocessed.is_some());
+    }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // The FFI-built preprocessed trace must equal the one built by the Rust chip.
+        let program = RecursionProgram::<BabyBear> {
+            instructions: vec![Poseidon2(Box::new(Poseidon2Instr {
+                addrs: Poseidon2Io {
+                    input: [Address(F::one()); WIDTH],
+                    output: [Address(F::two()); WIDTH],
+                },
+                mults: [F::one(); WIDTH],
+            }))],
+            ..Default::default()
+        };
+
+        let chip = Poseidon2WideChip::<9>;
+        let trace_rust = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+
+        assert_eq!(trace_ffi, trace_rust);
+    }
+
+    #[cfg(feature = "sys")] // Test helper: builds the preprocessed trace through the FFI.
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        // Collect the Poseidon2 instructions; every other instruction kind is ignored.
+        let instrs = program
+            .instructions
+            .iter()
+            .filter_map(|instruction| match instruction {
+                Poseidon2(instr) => Some(instr.as_ref()),
+                _ => None,
+            })
+            .collect::<Vec<_>>();
+
+        let padded_nb_rows = match program.fixed_log2_rows(&Poseidon2WideChip::<9>) {
+            Some(log2_rows) => 1 << log2_rows,
+            None => next_power_of_two(instrs.len(), None),
+        };
+        let mut values = vec![F::zero(); padded_nb_rows * PREPROCESSED_POSEIDON2_WIDTH];
+        // Only the leading `instrs.len()` rows are written; the remaining rows stay zero.
+        let populate_len = instrs.len() * PREPROCESSED_POSEIDON2_WIDTH;
+        values[..populate_len]
+            .par_chunks_mut(PREPROCESSED_POSEIDON2_WIDTH)
+            .zip_eq(instrs)
+            .for_each(|(row, instr)| {
+                let cols: &mut Poseidon2PreprocessedCols<_> = row.borrow_mut();
+                unsafe { // NOTE(review): presumably relies on C-compatible layouts — confirm.
+                    crate::sys::poseidon2_wide_instr_to_row_babybear(instr, cols);
+                }
+            });
+
+        RowMajorMatrix::new(values, PREPROCESSED_POSEIDON2_WIDTH)
+    }
 }
diff --git a/crates/recursion/core/src/chips/public_values.rs b/crates/recursion/core/src/chips/public_values.rs
index e81ed89758..c2c6a9450f 100644
--- a/crates/recursion/core/src/chips/public_values.rs
+++ b/crates/recursion/core/src/chips/public_values.rs
@@ -16,7 +16,7 @@ use crate::{
 
 use crate::DIGEST_SIZE;
 
-use super::mem::MemoryAccessCols;
+use super::mem::{MemoryAccessCols, MemoryAccessColsChips};
 
 pub const NUM_PUBLIC_VALUES_COLS: usize = core::mem::size_of::<PublicValuesCols<u8>>();
 pub const NUM_PUBLIC_VALUES_PREPROCESSED_COLS: usize =
@@ -32,7 +32,7 @@ pub struct PublicValuesChip;
 #[repr(C)]
 pub struct PublicValuesPreprocessedCols<T: Copy> {
     pub pv_idx: [T; DIGEST_SIZE],
-    pub pv_mem: MemoryAccessCols<T>,
+    pub pv_mem: MemoryAccessColsChips<T>,
 }
 
 /// The cols for a CommitPVHash invocation.
@@ -188,6 +188,8 @@ mod tests {
     use p3_field::AbstractField;
     use p3_matrix::dense::RowMajorMatrix;
 
+    use super::*;
+
     use crate::{
         air::{RecursionPublicValues, NUM_PV_ELMS_TO_HASH, RECURSIVE_PROOF_NUM_PV_ELTS},
         chips::public_values::PublicValuesChip,
@@ -249,4 +251,148 @@ mod tests {
         let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
         println!("{:?}", trace.values)
     }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // The FFI-built public-values trace must equal the one built by the Rust chip.
+        let mut rng = StdRng::seed_from_u64(0xDEADBEEF); // Fixed seed: same inputs every run.
+        let random_felts: [F; RECURSIVE_PROOF_NUM_PV_ELTS] =
+            array::from_fn(|_| F::from_canonical_u32(rng.gen_range(0..1 << 16)));
+        let random_public_values: &RecursionPublicValues<F> = random_felts.as_slice().borrow();
+
+        let shard = ExecutionRecord {
+            commit_pv_hash_events: vec![CommitPublicValuesEvent {
+                public_values: *random_public_values,
+            }],
+            ..Default::default()
+        };
+
+        let chip = PublicValuesChip;
+        let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+
+        assert_eq!(trace_ffi, trace);
+    }
+
+    #[cfg(feature = "sys")] // Test helper: builds the public-values trace through the FFI.
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        if input.commit_pv_hash_events.len() != 1 {
+            tracing::warn!("Expected exactly one CommitPVHash event.");
+        }
+
+        let mut rows: Vec<[F; NUM_PUBLIC_VALUES_COLS]> = Vec::new();
+
+        // Only the first CommitPVHash event is used (the warning above flags any other count);
+        // it contributes `DIGEST_SIZE` rows, one per index `i`.
+        for event in input.commit_pv_hash_events.iter().take(1) {
+            for i in 0..DIGEST_SIZE {
+                let mut row = [F::zero(); NUM_PUBLIC_VALUES_COLS];
+                let cols: &mut PublicValuesCols<F> = row.as_mut_slice().borrow_mut();
+                unsafe {
+                    crate::sys::public_values_event_to_row_babybear(event, i, cols);
+                }
+                rows.push(row);
+            }
+        }
+
+        // Pad with all-zero rows up to the fixed height selected by `PUB_VALUES_LOG_HEIGHT`.
+        pad_rows_fixed(
+            &mut rows,
+            || [F::zero(); NUM_PUBLIC_VALUES_COLS],
+            Some(PUB_VALUES_LOG_HEIGHT),
+        );
+
+        RowMajorMatrix::new(rows.into_iter().flatten().collect(), NUM_PUBLIC_VALUES_COLS)
+    }
+
+    #[test]
+    fn generate_public_values_preprocessed_trace() {
+        type F = BabyBear;
+        // Smoke test: generation must succeed (`unwrap`); the values are printed, not checked.
+        let addr = 0u32;
+        let public_values_a: [u32; RECURSIVE_PROOF_NUM_PV_ELTS] =
+            array::from_fn(|i| i as u32 + addr);
+        let public_values: &RecursionPublicValues<u32> = public_values_a.as_slice().borrow();
+
+        let program = RecursionProgram::<F> {
+            instructions: vec![instr::commit_public_values(public_values)],
+            ..Default::default()
+        };
+
+        let chip = PublicValuesChip;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        println!("{:?}", trace.values);
+    }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        let addr = 0u32;
+        let public_values_a: [u32; RECURSIVE_PROOF_NUM_PV_ELTS] =
+            array::from_fn(|i| i as u32 + addr);
+        let public_values: &RecursionPublicValues<u32> = public_values_a.as_slice().borrow();
+        // Single commit-public-values instruction built from the sequence 0, 1, 2, ...
+        let program = RecursionProgram {
+            instructions: vec![instr::commit_public_values(public_values)],
+            ..Default::default()
+        };
+        // The FFI-built preprocessed trace must equal the one built by the Rust chip.
+        let chip = PublicValuesChip;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+
+        assert_eq!(trace_ffi, trace);
+    }
+
+    #[cfg(feature = "sys")] // Test helper: builds the preprocessed trace through the FFI.
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        let mut rows: Vec<[F; NUM_PUBLIC_VALUES_PREPROCESSED_COLS]> = Vec::new();
+        let commit_pv_hash_instrs = program
+            .instructions
+            .iter()
+            .filter_map(|instruction| {
+                if let Instruction::CommitPublicValues(instr) = instruction {
+                    Some(instr)
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>();
+
+        if commit_pv_hash_instrs.len() != 1 {
+            tracing::warn!("Expected exactly one CommitPVHash instruction.");
+        }
+
+        // Only the first matching instruction is used; it contributes `DIGEST_SIZE` rows.
+        for instr in commit_pv_hash_instrs.iter().take(1) {
+            for i in 0..DIGEST_SIZE {
+                let mut row = [F::zero(); NUM_PUBLIC_VALUES_PREPROCESSED_COLS];
+                let cols: &mut PublicValuesPreprocessedCols<F> = row.as_mut_slice().borrow_mut();
+                unsafe {
+                    crate::sys::public_values_instr_to_row_babybear(instr, i, cols);
+                }
+                rows.push(row);
+            }
+        }
+
+        // Pad with all-zero rows up to the fixed height selected by `PUB_VALUES_LOG_HEIGHT`.
+        pad_rows_fixed(
+            &mut rows,
+            || [F::zero(); NUM_PUBLIC_VALUES_PREPROCESSED_COLS],
+            Some(PUB_VALUES_LOG_HEIGHT),
+        );
+
+        RowMajorMatrix::new(
+            rows.into_iter().flatten().collect(),
+            NUM_PUBLIC_VALUES_PREPROCESSED_COLS,
+        )
+    }
 }
diff --git a/crates/recursion/core/src/chips/select.rs b/crates/recursion/core/src/chips/select.rs
index d1c44d9b94..82231306ed 100644
--- a/crates/recursion/core/src/chips/select.rs
+++ b/crates/recursion/core/src/chips/select.rs
@@ -1,7 +1,6 @@
 use core::borrow::Borrow;
 use p3_air::{Air, BaseAir, PairBuilder};
-use p3_field::AbstractField;
-use p3_field::{Field, PrimeField32};
+use p3_field::{AbstractField, Field, PrimeField32};
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use p3_maybe_rayon::prelude::*;
 use sp1_core_machine::utils::next_power_of_two;
@@ -229,4 +228,168 @@ mod tests {
 
         run_recursion_test_machines(program);
     }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // One select event with `bit = 1`, where the outputs are the inputs swapped.
+        let shard = ExecutionRecord {
+            select_events: vec![SelectIo {
+                bit: F::one(),
+                out1: F::from_canonical_u32(5),
+                out2: F::from_canonical_u32(3),
+                in1: F::from_canonical_u32(3),
+                in2: F::from_canonical_u32(5),
+            }],
+            ..Default::default()
+        };
+        // The FFI-built trace must equal the trace built by the Rust chip.
+        let chip = SelectChip;
+        let trace: RowMajorMatrix<F> = chip.generate_trace(&shard, &mut ExecutionRecord::default());
+        let trace_ffi = generate_trace_ffi(&shard);
+
+        assert_eq!(trace_ffi, trace);
+    }
+
+    #[cfg(feature = "sys")] // Test helper: builds the select trace through the FFI.
+    fn generate_trace_ffi(input: &ExecutionRecord<BabyBear>) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+
+        let events = &input.select_events;
+        let nb_rows = events.len();
+        let fixed_log2_rows = input.fixed_log2_rows(&SelectChip);
+        let padded_nb_rows = match fixed_log2_rows {
+            Some(log2_rows) => 1 << log2_rows,
+            None => next_power_of_two(nb_rows, None),
+        };
+        let mut values = vec![F::zero(); padded_nb_rows * SELECT_COLS];
+        // Events are split into per-CPU batches of `chunk_size` rows (at least one row each).
+        let chunk_size = std::cmp::max(events.len() / num_cpus::get(), 1);
+        let populate_len = events.len() * SELECT_COLS;
+        // Only the first `events.len()` rows are written; the padding rows stay zero.
+        values[..populate_len].par_chunks_mut(chunk_size * SELECT_COLS).enumerate().for_each(
+            |(i, rows)| {
+                rows.chunks_mut(SELECT_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = i * chunk_size + j; // Global row index of this event.
+                    if idx < events.len() { // Always true here: only event rows are iterated.
+                        let cols: &mut SelectCols<_> = row.borrow_mut();
+                        unsafe {
+                            crate::sys::select_event_to_row_babybear(&events[idx], cols);
+                        }
+                    }
+                });
+            },
+        );
+
+        RowMajorMatrix::new(values, SELECT_COLS)
+    }
+
+    #[test]
+    fn generate_preprocessed_trace() {
+        type F = BabyBear;
+        // Smoke test: two select instructions over addresses 0..=4 and 5..=9; output is printed.
+        let program = RecursionProgram {
+            instructions: vec![
+                Instruction::Select(SelectInstr {
+                    addrs: SelectIo {
+                        bit: Address(F::zero()),
+                        out1: Address(F::one()),
+                        out2: Address(F::from_canonical_u32(2)),
+                        in1: Address(F::from_canonical_u32(3)),
+                        in2: Address(F::from_canonical_u32(4)),
+                    },
+                    mult1: F::one(),
+                    mult2: F::one(),
+                }),
+                Instruction::Select(SelectInstr {
+                    addrs: SelectIo {
+                        bit: Address(F::from_canonical_u32(5)),
+                        out1: Address(F::from_canonical_u32(6)),
+                        out2: Address(F::from_canonical_u32(7)),
+                        in1: Address(F::from_canonical_u32(8)),
+                        in2: Address(F::from_canonical_u32(9)),
+                    },
+                    mult1: F::one(),
+                    mult2: F::one(),
+                }),
+            ],
+            ..Default::default()
+        };
+
+        let chip = SelectChip;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        println!("{:?}", trace.values);
+    }
+
+    #[cfg(feature = "sys")]
+    #[test]
+    fn test_generate_preprocessed_trace_ffi_eq_rust() {
+        type F = BabyBear;
+        // The FFI-built preprocessed trace must equal the one built by the Rust chip.
+        let program = RecursionProgram {
+            instructions: vec![Instruction::Select(SelectInstr {
+                addrs: SelectIo {
+                    bit: Address(F::zero()),
+                    out1: Address(F::one()),
+                    out2: Address(F::from_canonical_u32(2)),
+                    in1: Address(F::from_canonical_u32(3)),
+                    in2: Address(F::from_canonical_u32(4)),
+                },
+                mult1: F::one(),
+                mult2: F::one(),
+            })],
+            ..Default::default()
+        };
+
+        let chip = SelectChip;
+        let trace = chip.generate_preprocessed_trace(&program).unwrap();
+        let trace_ffi = generate_preprocessed_trace_ffi(&program);
+
+        assert_eq!(trace_ffi, trace);
+    }
+
+    #[cfg(feature = "sys")] // Test helper: builds the preprocessed select trace through the FFI.
+    fn generate_preprocessed_trace_ffi(
+        program: &RecursionProgram<BabyBear>,
+    ) -> RowMajorMatrix<BabyBear> {
+        type F = BabyBear;
+        // Collect the select instructions; every other instruction kind is ignored.
+        let instrs = program
+            .instructions
+            .iter()
+            .filter_map(|instruction| match instruction {
+                Instruction::Select(x) => Some(x),
+                _ => None,
+            })
+            .collect::<Vec<_>>();
+
+        let nb_rows = instrs.len();
+        let fixed_log2_rows = program.fixed_log2_rows(&SelectChip);
+        let padded_nb_rows = match fixed_log2_rows {
+            Some(log2_rows) => 1 << log2_rows,
+            None => next_power_of_two(nb_rows, None),
+        };
+        let mut values = vec![F::zero(); padded_nb_rows * SELECT_PREPROCESSED_COLS];
+        // Instructions are split into per-CPU batches of `chunk_size` rows (at least one each).
+        let chunk_size = std::cmp::max(instrs.len() / num_cpus::get(), 1);
+        let populate_len = instrs.len() * SELECT_PREPROCESSED_COLS;
+        // Only the first `instrs.len()` rows are written; the padding rows stay zero.
+        values[..populate_len]
+            .par_chunks_mut(chunk_size * SELECT_PREPROCESSED_COLS)
+            .enumerate()
+            .for_each(|(i, rows)| {
+                rows.chunks_mut(SELECT_PREPROCESSED_COLS).enumerate().for_each(|(j, row)| {
+                    let idx = i * chunk_size + j; // Global row index of this instruction.
+                    if idx < instrs.len() { // Always true here: only populated rows are iterated.
+                        let cols: &mut SelectPreprocessedCols<_> = row.borrow_mut();
+                        unsafe {
+                            crate::sys::select_instr_to_row_babybear(instrs[idx], cols);
+                        }
+                    }
+                });
+            });
+
+        RowMajorMatrix::new(values, SELECT_PREPROCESSED_COLS)
+    }
 }
diff --git a/crates/recursion/core/src/lib.rs b/crates/recursion/core/src/lib.rs
index bec9e0b0ab..0fc2cd021d 100644
--- a/crates/recursion/core/src/lib.rs
+++ b/crates/recursion/core/src/lib.rs
@@ -11,6 +11,8 @@ pub mod machine;
 pub mod runtime;
 pub mod shape;
 pub mod stark;
+#[cfg(feature = "sys")]
+pub mod sys;
 
 pub use runtime::*;
 
@@ -47,6 +49,7 @@ pub type BaseAluEvent<F> = BaseAluIo<F>;
 
 /// An instruction invoking the extension field ALU.
 #[derive(Clone, Debug, Serialize, Deserialize)]
+#[repr(C)]
 pub struct BaseAluInstr<F> {
     pub opcode: BaseAluOpcode,
     pub mult: F,
@@ -68,6 +71,7 @@ pub type ExtAluEvent<F> = ExtAluIo<Block<F>>;
 
 /// An instruction invoking the extension field ALU.
 #[derive(Clone, Debug, Serialize, Deserialize)]
+#[repr(C)]
 pub struct ExtAluInstr<F> {
     pub opcode: ExtAluOpcode,
     pub mult: F,
@@ -102,6 +106,7 @@ pub enum MemAccessKind {
 
 /// The inputs and outputs to a Poseidon2 permutation.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct Poseidon2Io<V> {
     pub input: [V; WIDTH],
     pub output: [V; WIDTH],
@@ -109,6 +114,7 @@ pub struct Poseidon2Io<V> {
 
 /// An instruction invoking the Poseidon2 permutation.
 #[derive(Clone, Debug, Serialize, Deserialize)]
+#[repr(C)]
 pub struct Poseidon2SkinnyInstr<F> {
     pub addrs: Poseidon2Io<Address<F>>,
     pub mults: [F; WIDTH],
@@ -118,6 +124,7 @@ pub type Poseidon2Event<F> = Poseidon2Io<F>;
 
 /// The inputs and outputs to a select operation.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct SelectIo<V> {
     pub bit: V,
     pub out1: V,
@@ -128,6 +135,7 @@ pub struct SelectIo<V> {
 
 /// An instruction invoking the select operation.
 #[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+#[repr(C)]
 pub struct SelectInstr<F> {
     pub addrs: SelectIo<Address<F>>,
     pub mult1: F,
@@ -156,6 +164,30 @@ pub struct ExpReverseBitsInstr<F> {
     pub mult: F,
 }
 
+#[derive(Clone, Debug, PartialEq, Eq)] // NOTE: `exp_ptr` is compared by address, not contents.
+#[repr(C)] // Borrowed, C-layout view of an `ExpReverseBitsInstr` (built by `From` below).
+pub struct ExpReverseBitsInstrFFI<'a, F> {
+    pub base: &'a Address<F>,
+    pub exp_ptr: *const Address<F>, // Start of the instruction's `exp` addresses.
+    pub exp_len: usize, // Number of elements behind `exp_ptr`.
+    pub result: &'a Address<F>,
+
+    pub mult: &'a F,
+}
+
+impl<'a, F> From<&'a ExpReverseBitsInstr<F>> for ExpReverseBitsInstrFFI<'a, F> {
+    fn from(instr: &'a ExpReverseBitsInstr<F>) -> Self {
+        Self {
+            base: &instr.addrs.base,
+            exp_ptr: instr.addrs.exp.as_ptr(), // Raw pointer: valid only while `instr` is alive.
+            exp_len: instr.addrs.exp.len(),
+            result: &instr.addrs.result,
+
+            mult: &instr.mult,
+        }
+    }
+}
+
 /// The event encoding the inputs and outputs of an exp-reverse-bits operation. The `len` operand is
 /// now stored as the length of the `exp` field.
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
@@ -165,6 +197,26 @@ pub struct ExpReverseBitsEvent<F> {
     pub result: F,
 }
 
+#[derive(Clone, Debug, PartialEq, Eq)] // NOTE: `exp_ptr` is compared by address, not contents.
+#[repr(C)] // Borrowed, C-layout view of an `ExpReverseBitsEvent` (built by `From` below).
+pub struct ExpReverseBitsEventFFI<'a, F> {
+    pub base: &'a F,
+    pub exp_ptr: *const F, // Start of the event's `exp` values.
+    pub exp_len: usize, // Number of elements behind `exp_ptr`.
+    pub result: &'a F,
+}
+
+impl<'a, F> From<&'a ExpReverseBitsEvent<F>> for ExpReverseBitsEventFFI<'a, F> {
+    fn from(event: &'a ExpReverseBitsEvent<F>) -> Self {
+        Self {
+            base: &event.base,
+            exp_ptr: event.exp.as_ptr(), // Raw pointer: valid only while `event` is alive.
+            exp_len: event.exp.len(),
+            result: &event.result,
+        }
+    }
+}
+
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
 pub struct FriFoldIo<V> {
     pub ext_single: FriFoldExtSingleIo<Block<V>>,
@@ -173,14 +225,16 @@ pub struct FriFoldIo<V> {
 }
 
 /// The extension-field-valued single inputs to the FRI fold operation.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct FriFoldExtSingleIo<V> {
     pub z: V,
     pub alpha: V,
 }
 
 /// The extension-field-valued vector inputs to the FRI fold operation.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct FriFoldExtVecIo<V> {
     pub mat_opening: V,
     pub ps_at_z: V,
@@ -191,7 +245,8 @@ pub struct FriFoldExtVecIo<V> {
 }
 
 /// The base-field-valued inputs to the FRI fold operation.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct FriFoldBaseIo<V> {
     pub x: V,
 }
@@ -207,10 +262,71 @@ pub struct FriFoldInstr<F> {
     pub ro_mults: Vec<F>,
 }
 
+#[derive(Clone, Debug, PartialEq, Eq)] // NOTE: `*_ptr` fields are compared by address.
+#[repr(C)] // Borrowed, C-layout view of a `FriFoldInstr` (built by the `From` impls below).
+pub struct FriFoldInstrFFI<'a, F> {
+    pub base_single_addrs: &'a FriFoldBaseIo<Address<F>>,
+    pub ext_single_addrs: &'a FriFoldExtSingleIo<Address<F>>,
+    // Each `ext_vec_addrs` field is passed as a (pointer, length) pair.
+    pub ext_vec_addrs_mat_opening_ptr: *const Address<F>,
+    pub ext_vec_addrs_mat_opening_len: usize,
+    pub ext_vec_addrs_ps_at_z_ptr: *const Address<F>,
+    pub ext_vec_addrs_ps_at_z_len: usize,
+    pub ext_vec_addrs_alpha_pow_input_ptr: *const Address<F>,
+    pub ext_vec_addrs_alpha_pow_input_len: usize,
+    pub ext_vec_addrs_ro_input_ptr: *const Address<F>,
+    pub ext_vec_addrs_ro_input_len: usize,
+    pub ext_vec_addrs_alpha_pow_output_ptr: *const Address<F>,
+    pub ext_vec_addrs_alpha_pow_output_len: usize,
+    pub ext_vec_addrs_ro_output_ptr: *const Address<F>,
+    pub ext_vec_addrs_ro_output_len: usize,
+    // The two multiplicity vectors, again as (pointer, length) pairs.
+    pub alpha_pow_mults_ptr: *const F,
+    pub alpha_pow_mults_len: usize,
+
+    pub ro_mults_ptr: *const F,
+    pub ro_mults_len: usize,
+}
+
+impl<'a, F> From<&'a FriFoldInstr<F>> for FriFoldInstrFFI<'a, F> {
+    fn from(instr: &'a FriFoldInstr<F>) -> Self {
+        Self {
+            base_single_addrs: &instr.base_single_addrs,
+            ext_single_addrs: &instr.ext_single_addrs,
+            // Raw pointers below point into `instr`; they are valid only while it is alive.
+            ext_vec_addrs_mat_opening_ptr: instr.ext_vec_addrs.mat_opening.as_ptr(),
+            ext_vec_addrs_mat_opening_len: instr.ext_vec_addrs.mat_opening.len(),
+            ext_vec_addrs_ps_at_z_ptr: instr.ext_vec_addrs.ps_at_z.as_ptr(),
+            ext_vec_addrs_ps_at_z_len: instr.ext_vec_addrs.ps_at_z.len(),
+            ext_vec_addrs_alpha_pow_input_ptr: instr.ext_vec_addrs.alpha_pow_input.as_ptr(),
+            ext_vec_addrs_alpha_pow_input_len: instr.ext_vec_addrs.alpha_pow_input.len(),
+            ext_vec_addrs_ro_input_ptr: instr.ext_vec_addrs.ro_input.as_ptr(),
+            ext_vec_addrs_ro_input_len: instr.ext_vec_addrs.ro_input.len(),
+            ext_vec_addrs_alpha_pow_output_ptr: instr.ext_vec_addrs.alpha_pow_output.as_ptr(),
+            ext_vec_addrs_alpha_pow_output_len: instr.ext_vec_addrs.alpha_pow_output.len(),
+            ext_vec_addrs_ro_output_ptr: instr.ext_vec_addrs.ro_output.as_ptr(),
+            ext_vec_addrs_ro_output_len: instr.ext_vec_addrs.ro_output.len(),
+
+            alpha_pow_mults_ptr: instr.alpha_pow_mults.as_ptr(),
+            alpha_pow_mults_len: instr.alpha_pow_mults.len(),
+
+            ro_mults_ptr: instr.ro_mults.as_ptr(),
+            ro_mults_len: instr.ro_mults.len(),
+        }
+    }
+}
+// Convenience conversion from a borrowed `Box`; forwards to the impl above.
+impl<'a, F> From<&'a Box<FriFoldInstr<F>>> for FriFoldInstrFFI<'a, F> {
+    fn from(instr: &'a Box<FriFoldInstr<F>>) -> Self {
+        Self::from(instr.as_ref())
+    }
+}
+
 /// The event encoding the data of a single iteration within the FRI fold operation.
 /// For any given event, we are accessing a single element of the `Vec` inputs, so that the event
 /// is not a type alias for `FriFoldIo` like many of the other events.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct FriFoldEvent<F> {
     pub base_single: FriFoldBaseIo<F>,
     pub ext_single: FriFoldExtSingleIo<Block<F>>,
@@ -225,20 +341,23 @@ pub struct BatchFRIIo<V> {
 }
 
 /// The extension-field-valued single inputs to the batch FRI operation.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct BatchFRIExtSingleIo<V> {
     pub acc: V,
 }
 
 /// The extension-field-valued vector inputs to the batch FRI operation.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct BatchFRIExtVecIo<V> {
     pub p_at_z: V,
     pub alpha_pow: V,
 }
 
 /// The base-field-valued vector inputs to the batch FRI operation.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct BatchFRIBaseVecIo<V> {
     pub p_at_x: V,
 }
@@ -253,10 +372,51 @@ pub struct BatchFRIInstr<F> {
     pub acc_mult: F,
 }
 
+#[derive(Clone, Debug, PartialEq, Eq)] // NOTE: `*_ptr` fields are compared by address.
+#[repr(C)] // Borrowed, C-layout view of a `BatchFRIInstr` (built by the `From` impls below).
+pub struct BatchFRIInstrFFI<'a, F> {
+    pub base_vec_addrs_p_at_x_ptr: *const Address<F>,
+    pub base_vec_addrs_p_at_x_len: usize,
+
+    pub ext_single_addrs: &'a BatchFRIExtSingleIo<Address<F>>,
+    // Each `ext_vec_addrs` field is passed as a (pointer, length) pair.
+    pub ext_vec_addrs_p_at_z_ptr: *const Address<F>,
+    pub ext_vec_addrs_p_at_z_len: usize,
+    pub ext_vec_addrs_alpha_pow_ptr: *const Address<F>,
+    pub ext_vec_addrs_alpha_pow_len: usize,
+
+    pub acc_mult: &'a F,
+}
+
+impl<'a, F> From<&'a BatchFRIInstr<F>> for BatchFRIInstrFFI<'a, F> {
+    fn from(instr: &'a BatchFRIInstr<F>) -> Self {
+        Self {
+            base_vec_addrs_p_at_x_ptr: instr.base_vec_addrs.p_at_x.as_ptr(),
+            base_vec_addrs_p_at_x_len: instr.base_vec_addrs.p_at_x.len(),
+
+            ext_single_addrs: &instr.ext_single_addrs,
+            // Raw pointers point into `instr`; they are valid only while it is alive.
+            ext_vec_addrs_p_at_z_ptr: instr.ext_vec_addrs.p_at_z.as_ptr(),
+            ext_vec_addrs_p_at_z_len: instr.ext_vec_addrs.p_at_z.len(),
+            ext_vec_addrs_alpha_pow_ptr: instr.ext_vec_addrs.alpha_pow.as_ptr(),
+            ext_vec_addrs_alpha_pow_len: instr.ext_vec_addrs.alpha_pow.len(),
+
+            acc_mult: &instr.acc_mult,
+        }
+    }
+}
+// Convenience conversion from a doubly-borrowed `Box`; forwards to the impl above.
+impl<'a, 'b: 'a, F> From<&'b &'b Box<BatchFRIInstr<F>>> for BatchFRIInstrFFI<'a, F> {
+    fn from(instr: &'b &'b Box<BatchFRIInstr<F>>) -> Self {
+        Self::from(instr.as_ref())
+    }
+}
+
 /// The event encoding the data of a single iteration within the batch FRI operation.
 /// For any given event, we are accessing a single element of the `Vec` inputs, so that the event
 /// is not a type alias for `BatchFRIIo` like many of the other events.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[repr(C)]
 pub struct BatchFRIEvent<F> {
     pub base_vec: BatchFRIBaseVecIo<F>,
     pub ext_single: BatchFRIExtSingleIo<Block<F>>,
@@ -266,12 +426,14 @@ pub struct BatchFRIEvent<F> {
 /// An instruction that will save the public values to the execution record and will commit to
 /// it's digest.
 #[derive(Clone, Debug, Serialize, Deserialize)]
+#[repr(C)]
 pub struct CommitPublicValuesInstr<F> {
     pub pv_addrs: RecursionPublicValues<Address<F>>,
 }
 
 /// The event for committing to the public values.
-#[derive(Clone, Debug, Serialize, Deserialize)]
+#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
+#[repr(C)]
 pub struct CommitPublicValuesEvent<F> {
     pub public_values: RecursionPublicValues<F>,
 }
diff --git a/crates/recursion/core/src/machine.rs b/crates/recursion/core/src/machine.rs
index b143487d1d..d754367ef6 100644
--- a/crates/recursion/core/src/machine.rs
+++ b/crates/recursion/core/src/machine.rs
@@ -156,10 +156,10 @@ impl<F: PrimeField32 + BinomiallyExtendable<D>, const DEGREE: usize> RecursionAi
             [
                 (Self::MemoryConst(MemoryConstChip::default()), 17),
                 (Self::MemoryVar(MemoryVarChip::default()), 18),
-                (Self::BaseAlu(BaseAluChip), 20),
-                (Self::ExtAlu(ExtAluChip), 18),
+                (Self::BaseAlu(BaseAluChip), 15),
+                (Self::ExtAlu(ExtAluChip), 15),
                 (Self::Poseidon2Wide(Poseidon2WideChip::<DEGREE>), 16),
-                (Self::BatchFRI(BatchFRIChip::<DEGREE>), 18),
+                (Self::BatchFRI(BatchFRIChip::<DEGREE>), 17),
                 (Self::Select(SelectChip), 18),
                 (Self::ExpReverseBitsLen(ExpReverseBitsLenChip::<DEGREE>), 17),
                 (Self::PublicValues(PublicValuesChip), PUB_VALUES_LOG_HEIGHT),
@@ -231,6 +231,10 @@ impl<F> AddAssign<&Instruction<F>> for RecursionAirEventCount {
             Instruction::BatchFRI(instr) => {
                 self.batch_fri_events += instr.base_vec_addrs.p_at_x.len()
             }
+            Instruction::HintAddCurve(instr) => {
+                self.mem_var_events += instr.output_x_addrs_mults.len();
+                self.mem_var_events += instr.output_y_addrs_mults.len();
+            }
             Instruction::CommitPublicValues(_) => {}
             Instruction::Print(_) => {}
         }
diff --git a/crates/recursion/core/src/runtime/instruction.rs b/crates/recursion/core/src/runtime/instruction.rs
index 7a74097246..97e2c203fc 100644
--- a/crates/recursion/core/src/runtime/instruction.rs
+++ b/crates/recursion/core/src/runtime/instruction.rs
@@ -14,6 +14,7 @@ pub enum Instruction<F> {
     Select(SelectInstr<F>),
     ExpReverseBitsLen(ExpReverseBitsInstr<F>),
     HintBits(HintBitsInstr<F>),
+    HintAddCurve(Box<HintAddCurveInstr<F>>),
     FriFold(Box<FriFoldInstr<F>>),
     BatchFRI(Box<BatchFRIInstr<F>>),
     Print(PrintInstr<F>),
@@ -36,6 +37,26 @@ pub struct PrintInstr<F> {
     pub addr: Address<F>,
 }
 
+/// An instruction that hints the sum of two septic-curve points.
+///
+/// The runtime reads the coordinates of both inputs from memory, combines the points with
+/// `SepticCurve::add_incomplete`, and writes the coordinates of the result to the outputs.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct HintAddCurveInstr<F> {
+    /// Addresses and mults of the felts of the output x-coordinate.
+    pub output_x_addrs_mults: Vec<(Address<F>, F)>,
+    /// Addresses and mults of the felts of the output y-coordinate.
+    pub output_y_addrs_mults: Vec<(Address<F>, F)>,
+    /// Addresses of the felts of the first input's x-coordinate.
+    pub input1_x_addrs: Vec<Address<F>>,
+    /// Addresses of the felts of the first input's y-coordinate.
+    pub input1_y_addrs: Vec<Address<F>>,
+    /// Addresses of the felts of the second input's x-coordinate.
+    pub input2_x_addrs: Vec<Address<F>>,
+    /// Addresses of the felts of the second input's y-coordinate.
+    pub input2_y_addrs: Vec<Address<F>>,
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct HintInstr<F> {
     /// Addresses and mults of the output felts.
diff --git a/crates/recursion/core/src/runtime/mod.rs b/crates/recursion/core/src/runtime/mod.rs
index 90bdd1fb55..4c31c592f8 100644
--- a/crates/recursion/core/src/runtime/mod.rs
+++ b/crates/recursion/core/src/runtime/mod.rs
@@ -6,14 +6,23 @@ mod record;
 
 // Avoid triggering annoying branch of thiserror derive macro.
 use backtrace::Backtrace as Trace;
+use hashbrown::HashMap;
+use instruction::HintAddCurveInstr;
 pub use instruction::Instruction;
 use instruction::{FieldEltType, HintBitsInstr, HintExt2FeltsInstr, HintInstr, PrintInstr};
+use itertools::Itertools;
 use machine::RecursionAirEventCount;
 use memory::*;
 pub use opcode::*;
+use p3_field::AbstractExtensionField;
+use p3_field::{AbstractField, ExtensionField, PrimeField32};
+use p3_poseidon2::{Poseidon2, Poseidon2ExternalMatrixGeneral};
+use p3_symmetric::{CryptographicPermutation, Permutation};
+use p3_util::reverse_bits_len;
 pub use program::*;
 pub use record::*;
-
+use sp1_stark::septic_curve::SepticCurve;
+use sp1_stark::septic_extension::SepticExtension;
 use std::{
     array,
     borrow::Borrow,
@@ -24,13 +33,6 @@ use std::{
     marker::PhantomData,
     sync::Arc,
 };
-
-use hashbrown::HashMap;
-use itertools::Itertools;
-use p3_field::{AbstractField, ExtensionField, PrimeField32};
-use p3_poseidon2::{Poseidon2, Poseidon2ExternalMatrixGeneral};
-use p3_symmetric::{CryptographicPermutation, Permutation};
-use p3_util::reverse_bits_len;
 use thiserror::Error;
 
 use crate::air::{Block, RECURSIVE_PROOF_NUM_PV_ELTS};
@@ -40,9 +42,8 @@ use crate::*;
 
 /// The heap pointer address.
 pub const HEAP_PTR: i32 = -4;
-pub const HEAP_START_ADDRESS: usize = STACK_SIZE + 4;
-
 pub const STACK_SIZE: usize = 1 << 24;
+pub const HEAP_START_ADDRESS: usize = STACK_SIZE + 4;
 pub const MEMORY_SIZE: usize = 1 << 28;
 
 /// The width of the Poseidon2 permutation.
@@ -415,6 +416,43 @@ where
                         self.record.mem_var_events.push(MemEvent { inner: bit });
                     }
                 }
+                Instruction::HintAddCurve(instr) => {
+                    let HintAddCurveInstr {
+                        output_x_addrs_mults,
+                        output_y_addrs_mults,
+                        input1_x_addrs,
+                        input1_y_addrs,
+                        input2_x_addrs,
+                        input2_y_addrs,
+                    } = *instr;
+                    // Load the coordinates of both input points; the reads use a zero mult.
+                    let input1_x = SepticExtension::<F>::from_base_fn(|i| {
+                        self.memory.mr_mult(input1_x_addrs[i], F::zero()).val[0]
+                    });
+                    let input1_y = SepticExtension::<F>::from_base_fn(|i| {
+                        self.memory.mr_mult(input1_y_addrs[i], F::zero()).val[0]
+                    });
+                    let input2_x = SepticExtension::<F>::from_base_fn(|i| {
+                        self.memory.mr_mult(input2_x_addrs[i], F::zero()).val[0]
+                    });
+                    let input2_y = SepticExtension::<F>::from_base_fn(|i| {
+                        self.memory.mr_mult(input2_y_addrs[i], F::zero()).val[0]
+                    });
+                    let point1 = SepticCurve { x: input1_x, y: input1_y };
+                    let point2 = SepticCurve { x: input2_x, y: input2_y };
+                    // NOTE(review): presumably `add_incomplete` has preconditions on the two
+                    // points (e.g. distinct inputs); nothing is checked here, so confirm.
+                    let output = point1.add_incomplete(point2);
+
+                    // Store the x-coordinate felts first and the y-coordinate felts second,
+                    // recording one memory event per written felt.
+                    let x_writes = output.x.0.into_iter().zip(output_x_addrs_mults);
+                    let y_writes = output.y.0.into_iter().zip(output_y_addrs_mults);
+                    for (val, (addr, mult)) in x_writes.chain(y_writes) {
+                        self.memory.mw(addr, Block::from(val), mult);
+                        self.record.mem_var_events.push(MemEvent { inner: Block::from(val) });
+                    }
+                }
 
                 Instruction::FriFold(instr) => {
                     let FriFoldInstr {
diff --git a/crates/recursion/core/src/runtime/opcode.rs b/crates/recursion/core/src/runtime/opcode.rs
index 96a748d065..16e9ef575d 100644
--- a/crates/recursion/core/src/runtime/opcode.rs
+++ b/crates/recursion/core/src/runtime/opcode.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Serialize};
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[repr(C)]
 pub enum BaseAluOpcode {
     AddF,
     SubF,
@@ -9,6 +10,7 @@ pub enum BaseAluOpcode {
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[repr(C)]
 pub enum ExtAluOpcode {
     AddE,
     SubE,
diff --git a/crates/recursion/core/src/runtime/program.rs b/crates/recursion/core/src/runtime/program.rs
index 38fb29ca8a..f77aeeed07 100644
--- a/crates/recursion/core/src/runtime/program.rs
+++ b/crates/recursion/core/src/runtime/program.rs
@@ -1,10 +1,10 @@
+use crate::*;
 use backtrace::Backtrace;
 use p3_field::Field;
 use serde::{Deserialize, Serialize};
 use shape::RecursionShape;
 use sp1_stark::air::{MachineAir, MachineProgram};
-
-use crate::*;
+use sp1_stark::septic_digest::SepticDigest;
 
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
 pub struct RecursionProgram<F> {
@@ -19,6 +19,11 @@ impl<F: Field> MachineProgram<F> for RecursionProgram<F> {
     fn pc_start(&self) -> F {
         F::zero()
     }
+
+    /// Recursion programs always start from the zero digest.
+    fn initial_global_cumulative_sum(&self) -> SepticDigest<F> {
+        SepticDigest::<F>::zero()
+    }
 }
 
 impl<F: Field> RecursionProgram<F> {
 
 impl<F: Field> RecursionProgram<F> {
diff --git a/crates/recursion/core/src/shape.rs b/crates/recursion/core/src/shape.rs
index 39f47cf72c..32d034b9ee 100644
--- a/crates/recursion/core/src/shape.rs
+++ b/crates/recursion/core/src/shape.rs
@@ -27,6 +27,20 @@ pub struct RecursionShape {
     pub(crate) inner: HashMap<String, usize>,
 }
 
+impl RecursionShape {
+    /// Returns an owned copy of the underlying map.
+    pub fn clone_into_hash_map(&self) -> HashMap<String, usize> {
+        self.inner.clone()
+    }
+}
+
+impl From<HashMap<String, usize>> for RecursionShape {
+    /// Wraps `value` as a shape without validating its contents.
+    fn from(value: HashMap<String, usize>) -> Self {
+        Self { inner: value }
+    }
+}
+
 pub struct RecursionShapeConfig<F, A> {
     allowed_shapes: Vec<HashMap<String, usize>>,
     _marker: PhantomData<(F, A)>,
 pub struct RecursionShapeConfig<F, A> {
     allowed_shapes: Vec<HashMap<String, usize>>,
     _marker: PhantomData<(F, A)>,
@@ -81,6 +93,33 @@ impl<F: PrimeField32 + BinomiallyExtendable<D>, const DEGREE: usize>
             })
             .multi_cartesian_product()
     }
+
+    /// Builds a config with a single shape that dominates every allowed shape.
+    ///
+    /// For each key the largest value over all allowed shapes is taken and then increased
+    /// by 2. The `"PublicValues"` entry is always set to 4, regardless of the inputs.
+    pub fn union_config_with_extra_room(&self) -> Self {
+        let mut map = HashMap::new();
+        for shape in &self.allowed_shapes {
+            for (name, &value) in shape {
+                let slot = map.entry(name.clone()).or_insert(0);
+                *slot = (*slot).max(value);
+            }
+        }
+        map.values_mut().for_each(|x| *x += 2);
+        map.insert("PublicValues".to_string(), 4);
+        Self { allowed_shapes: vec![map], _marker: PhantomData }
+    }
+
+    /// Builds a config whose only allowed shape is a copy of `hash_map`.
+    pub fn from_hash_map(hash_map: &HashMap<String, usize>) -> Self {
+        Self { allowed_shapes: vec![hash_map.clone()], _marker: PhantomData }
+    }
+
+    /// Returns the first allowed shape, or `None` when there are no allowed shapes.
+    pub fn first(&self) -> Option<&HashMap<String, usize>> {
+        self.allowed_shapes.first()
+    }
 }
 
 impl<F: PrimeField32 + BinomiallyExtendable<D>, const DEGREE: usize> Default
 
 impl<F: PrimeField32 + BinomiallyExtendable<D>, const DEGREE: usize> Default
@@ -103,135 +136,25 @@ impl<F: PrimeField32 + BinomiallyExtendable<D>, const DEGREE: usize> Default
         // Specify allowed shapes.
         let allowed_shapes = [
             [
-                (base_alu.clone(), 20),
-                (mem_var.clone(), 18),
-                (ext_alu.clone(), 18),
-                (exp_reverse_bits_len.clone(), 17),
-                (mem_const.clone(), 17),
-                (poseidon2_wide.clone(), 16),
-                (batch_fri.clone(), 18),
-                (select.clone(), 18),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (base_alu.clone(), 20),
-                (mem_var.clone(), 18),
-                (ext_alu.clone(), 18),
-                (exp_reverse_bits_len.clone(), 17),
-                (mem_const.clone(), 16),
-                (poseidon2_wide.clone(), 16),
-                (batch_fri.clone(), 18),
-                (select.clone(), 18),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (ext_alu.clone(), 20),
-                (base_alu.clone(), 19),
+                (ext_alu.clone(), 16),
+                (base_alu.clone(), 16),
                 (mem_var.clone(), 19),
                 (poseidon2_wide.clone(), 17),
-                (mem_const.clone(), 16),
-                (exp_reverse_bits_len.clone(), 16),
-                (batch_fri.clone(), 20),
-                (select.clone(), 18),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (base_alu.clone(), 19),
-                (mem_var.clone(), 18),
-                (ext_alu.clone(), 18),
-                (exp_reverse_bits_len.clone(), 17),
-                (mem_const.clone(), 16),
-                (poseidon2_wide.clone(), 16),
-                (batch_fri.clone(), 18),
-                (select.clone(), 18),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (base_alu.clone(), 19),
-                (mem_var.clone(), 18),
-                (ext_alu.clone(), 18),
-                (exp_reverse_bits_len.clone(), 16),
-                (mem_const.clone(), 16),
-                (poseidon2_wide.clone(), 16),
+                (mem_const.clone(), 18),
                 (batch_fri.clone(), 18),
-                (select.clone(), 18),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (base_alu.clone(), 20),
-                (mem_var.clone(), 19),
-                (ext_alu.clone(), 19),
-                (exp_reverse_bits_len.clone(), 17),
-                (mem_const.clone(), 17),
-                (poseidon2_wide.clone(), 17),
-                (batch_fri.clone(), 19),
-                (select.clone(), 19),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (base_alu.clone(), 21),
-                (mem_var.clone(), 19),
-                (ext_alu.clone(), 19),
                 (exp_reverse_bits_len.clone(), 18),
-                (mem_const.clone(), 18),
-                (poseidon2_wide.clone(), 17),
-                (batch_fri.clone(), 19),
                 (select.clone(), 19),
                 (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
             ],
             [
-                (base_alu.clone(), 21),
+                (ext_alu.clone(), 17),
+                (base_alu.clone(), 16),
                 (mem_var.clone(), 19),
-                (ext_alu.clone(), 19),
-                (exp_reverse_bits_len.clone(), 18),
-                (mem_const.clone(), 17),
                 (poseidon2_wide.clone(), 17),
-                (batch_fri.clone(), 19),
-                (select.clone(), 19),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (ext_alu.clone(), 21),
-                (base_alu.clone(), 20),
-                (mem_var.clone(), 20),
-                (poseidon2_wide.clone(), 18),
-                (mem_const.clone(), 17),
-                (exp_reverse_bits_len.clone(), 17),
+                (mem_const.clone(), 18),
                 (batch_fri.clone(), 21),
-                (select.clone(), 19),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (base_alu.clone(), 20),
-                (mem_var.clone(), 19),
-                (ext_alu.clone(), 19),
-                (exp_reverse_bits_len.clone(), 18),
-                (mem_const.clone(), 17),
-                (poseidon2_wide.clone(), 17),
-                (batch_fri.clone(), 19),
-                (select.clone(), 19),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (base_alu.clone(), 20),
-                (mem_var.clone(), 19),
-                (ext_alu.clone(), 19),
-                (exp_reverse_bits_len.clone(), 17),
-                (mem_const.clone(), 17),
-                (poseidon2_wide.clone(), 17),
-                (batch_fri.clone(), 19),
-                (select.clone(), 19),
-                (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
-            ],
-            [
-                (base_alu.clone(), 21),
-                (mem_var.clone(), 20),
-                (ext_alu.clone(), 20),
                 (exp_reverse_bits_len.clone(), 18),
-                (mem_const.clone(), 18),
-                (poseidon2_wide.clone(), 18),
-                (batch_fri.clone(), 20),
-                (select.clone(), 19),
+                (select.clone(), 20),
                 (public_values.clone(), PUB_VALUES_LOG_HEIGHT),
             ],
         ]
diff --git a/crates/recursion/core/src/sys.rs b/crates/recursion/core/src/sys.rs
new file mode 100644
index 0000000000..1ddb6ce0b9
--- /dev/null
+++ b/crates/recursion/core/src/sys.rs
@@ -0,0 +1,136 @@
+//! Declarations of the native row-generation routines used by the recursion chips.
+//!
+//! The symbols are resolved from the statically linked `sp1-recursion-core-sys` library and
+//! are all specialized to the `BabyBear` field. Judging by their names, each routine writes
+//! one trace row (or part of one) for a single event or instruction into its output argument.
+
+use crate::{
+    air::Block,
+    chips::{
+        alu_base::{BaseAluAccessCols, BaseAluValueCols},
+        alu_ext::{ExtAluAccessCols, ExtAluValueCols},
+        batch_fri::{BatchFRICols, BatchFRIPreprocessedCols},
+        exp_reverse_bits::{ExpReverseBitsLenCols, ExpReverseBitsLenPreprocessedCols},
+        fri_fold::{FriFoldCols, FriFoldPreprocessedCols},
+        poseidon2_skinny::{
+            columns::{preprocessed::Poseidon2PreprocessedColsSkinny, Poseidon2},
+            NUM_EXTERNAL_ROUNDS, NUM_INTERNAL_ROUNDS,
+        },
+        poseidon2_wide::columns::preprocessed::Poseidon2PreprocessedColsWide,
+        public_values::{PublicValuesCols, PublicValuesPreprocessedCols},
+        select::{SelectCols, SelectPreprocessedCols},
+    },
+    BaseAluInstr, BaseAluIo, BatchFRIEvent, BatchFRIInstrFFI, CommitPublicValuesEvent,
+    CommitPublicValuesInstr, ExpReverseBitsEventFFI, ExpReverseBitsInstrFFI, ExtAluInstr, ExtAluIo,
+    FriFoldEvent, FriFoldInstrFFI, Poseidon2Event, Poseidon2Instr, SelectEvent, SelectInstr, WIDTH,
+};
+use p3_baby_bear::BabyBear;
+
+// Every item in this block is a foreign function, so calling it requires `unsafe`. The Rust
+// types passed by reference or pointer must have the layout the native side expects.
+// NOTE(review): the extent of the memory behind each raw `*mut` argument is dictated by the
+// native implementation; confirm it there before changing a call site.
+#[link(name = "sp1-recursion-core-sys", kind = "static")]
+extern "C-unwind" {
+    // `alu_base` chip.
+    pub fn alu_base_event_to_row_babybear(
+        io: &BaseAluIo<BabyBear>,
+        cols: &mut BaseAluValueCols<BabyBear>,
+    );
+    pub fn alu_base_instr_to_row_babybear(
+        instr: &BaseAluInstr<BabyBear>,
+        cols: &mut BaseAluAccessCols<BabyBear>,
+    );
+
+    // `alu_ext` chip.
+    pub fn alu_ext_event_to_row_babybear(
+        io: &ExtAluIo<Block<BabyBear>>,
+        cols: &mut ExtAluValueCols<BabyBear>,
+    );
+    pub fn alu_ext_instr_to_row_babybear(
+        instr: &ExtAluInstr<BabyBear>,
+        cols: &mut ExtAluAccessCols<BabyBear>,
+    );
+
+    // `batch_fri` chip.
+    pub fn batch_fri_event_to_row_babybear(
+        io: &BatchFRIEvent<BabyBear>,
+        cols: &mut BatchFRICols<BabyBear>,
+    );
+    pub fn batch_fri_instr_to_row_babybear(
+        instr: &BatchFRIInstrFFI<BabyBear>,
+        cols: &mut BatchFRIPreprocessedCols<BabyBear>,
+    );
+
+    // `exp_reverse_bits` chip.
+    pub fn exp_reverse_bits_event_to_row_babybear(
+        io: &ExpReverseBitsEventFFI<BabyBear>,
+        i: usize,
+        cols: &mut ExpReverseBitsLenCols<BabyBear>,
+    );
+    pub fn exp_reverse_bits_instr_to_row_babybear(
+        instr: &ExpReverseBitsInstrFFI<BabyBear>,
+        i: usize,
+        len: usize,
+        cols: &mut ExpReverseBitsLenPreprocessedCols<BabyBear>,
+    );
+
+    // `fri_fold` chip.
+    pub fn fri_fold_event_to_row_babybear(
+        io: &FriFoldEvent<BabyBear>,
+        cols: &mut FriFoldCols<BabyBear>,
+    );
+    pub fn fri_fold_instr_to_row_babybear(
+        instr: &FriFoldInstrFFI<BabyBear>,
+        i: usize,
+        cols: &mut FriFoldPreprocessedCols<BabyBear>,
+    );
+
+    // `public_values` chip.
+    pub fn public_values_event_to_row_babybear(
+        io: &CommitPublicValuesEvent<BabyBear>,
+        digest_idx: usize,
+        cols: &mut PublicValuesCols<BabyBear>,
+    );
+    pub fn public_values_instr_to_row_babybear(
+        instr: &CommitPublicValuesInstr<BabyBear>,
+        digest_idx: usize,
+        cols: &mut PublicValuesPreprocessedCols<BabyBear>,
+    );
+
+    // `select` chip.
+    pub fn select_event_to_row_babybear(
+        io: &SelectEvent<BabyBear>,
+        cols: &mut SelectCols<BabyBear>,
+    );
+    pub fn select_instr_to_row_babybear(
+        instr: &SelectInstr<BabyBear>,
+        cols: &mut SelectPreprocessedCols<BabyBear>,
+    );
+
+    // `poseidon2_skinny` chip.
+    pub fn poseidon2_skinny_event_to_row_babybear(
+        io: &Poseidon2Event<BabyBear>,
+        cols: *mut Poseidon2<BabyBear>,
+    );
+    pub fn poseidon2_skinny_instr_to_row_babybear(
+        instr: &Poseidon2Instr<BabyBear>,
+        i: usize,
+        cols: &mut Poseidon2PreprocessedColsSkinny<BabyBear>,
+    );
+
+    // `poseidon2_wide` chip.
+    pub fn poseidon2_wide_event_to_row_babybear(
+        input: &[BabyBear; WIDTH],
+        external_rounds_state: *mut BabyBear,
+        internal_rounds_state: &mut [BabyBear; WIDTH],
+        internal_rounds_s0: &mut [BabyBear; NUM_INTERNAL_ROUNDS - 1],
+        external_sbox: *mut [[BabyBear; NUM_EXTERNAL_ROUNDS]; WIDTH],
+        internal_sbox: *mut [BabyBear; NUM_INTERNAL_ROUNDS],
+        output_state: &mut [BabyBear; WIDTH],
+    );
+    pub fn poseidon2_wide_instr_to_row_babybear(
+        instr: &Poseidon2Instr<BabyBear>,
+        cols: &mut Poseidon2PreprocessedColsWide<BabyBear>,
+    );
+}
diff --git a/crates/recursion/gnark-ffi/src/ffi/docker.rs b/crates/recursion/gnark-ffi/src/ffi/docker.rs
index bf0f2865bd..56d416be8d 100644
--- a/crates/recursion/gnark-ffi/src/ffi/docker.rs
+++ b/crates/recursion/gnark-ffi/src/ffi/docker.rs
@@ -1,5 +1,4 @@
-use crate::ProofBn254;
-use crate::{Groth16Bn254Proof, PlonkBn254Proof};
+use crate::{Groth16Bn254Proof, PlonkBn254Proof, ProofBn254};
 use anyhow::{anyhow, Result};
 use sp1_core_machine::SP1_CIRCUIT_VERSION;
 use std::{io::Write, process::Command};
diff --git a/crates/sdk/src/network-v2/client.rs b/crates/sdk/src/network-v2/client.rs
index 24e7469a54..91c5eea001 100644
--- a/crates/sdk/src/network-v2/client.rs
+++ b/crates/sdk/src/network-v2/client.rs
@@ -4,26 +4,28 @@ use alloy_signer::SignerSync;
 use alloy_signer_local::PrivateKeySigner;
 use anyhow::{Context, Ok, Result};
 use reqwest_middleware::ClientWithMiddleware as HttpClientWithMiddleware;
-use serde::de::DeserializeOwned;
-use serde::Serialize;
+use serde::{de::DeserializeOwned, Serialize};
 use sp1_core_machine::io::SP1Stdin;
 use sp1_prover::SP1VerifyingKey;
-use std::str::FromStr;
-use std::time::{SystemTime, UNIX_EPOCH};
-use tokio::try_join;
-use tonic::transport::channel::ClientTlsConfig;
-use tonic::transport::Channel;
-
-use crate::network_v2::proto::artifact::{
-    artifact_store_client::ArtifactStoreClient, CreateArtifactRequest,
+use std::{
+    str::FromStr,
+    time::{SystemTime, UNIX_EPOCH},
 };
-use crate::network_v2::proto::network::{
-    prover_network_client::ProverNetworkClient, GetFilteredProofRequestsRequest,
-    GetFilteredProofRequestsResponse, GetNonceRequest, GetProofRequestStatusRequest,
-    GetProofRequestStatusResponse, ProofMode, ProofStatus, ProofStrategy, RequestProofRequest,
-    RequestProofRequestBody, RequestProofResponse,
+use tokio::try_join;
+use tonic::transport::{channel::ClientTlsConfig, Channel};
+
+use crate::network_v2::{
+    proto::{
+        artifact::{artifact_store_client::ArtifactStoreClient, CreateArtifactRequest},
+        network::{
+            prover_network_client::ProverNetworkClient, GetFilteredProofRequestsRequest,
+            GetFilteredProofRequestsResponse, GetNonceRequest, GetProofRequestStatusRequest,
+            GetProofRequestStatusResponse, ProofMode, ProofStatus, ProofStrategy,
+            RequestProofRequest, RequestProofRequestBody, RequestProofResponse,
+        },
+    },
+    Signable,
 };
-use crate::network_v2::Signable;
 
 /// The default RPC endpoint for the Succinct prover network.
 pub const DEFAULT_PROVER_NETWORK_RPC: &str = "https://rpc.production.succinct.tools/";
diff --git a/crates/sdk/src/network-v2/prover.rs b/crates/sdk/src/network-v2/prover.rs
index 83a86b55eb..11d1872455 100644
--- a/crates/sdk/src/network-v2/prover.rs
+++ b/crates/sdk/src/network-v2/prover.rs
@@ -14,7 +14,8 @@ use sp1_prover::{components::DefaultProverComponents, SP1Prover, SP1_CIRCUIT_VER
 use sp1_stark::SP1ProverOpts;
 use tonic::Code;
 
-use {crate::block_on, tokio::time::sleep};
+use crate::block_on;
+use tokio::time::sleep;
 
 use crate::provers::{CpuProver, ProofOpts, ProverType};
 
diff --git a/crates/sdk/src/network/prover.rs b/crates/sdk/src/network/prover.rs
index 70969adf5b..7b9a480f70 100644
--- a/crates/sdk/src/network/prover.rs
+++ b/crates/sdk/src/network/prover.rs
@@ -15,7 +15,8 @@ use sp1_stark::SP1ProverOpts;
 
 use super::proto::network::GetProofStatusResponse;
 
-use {crate::block_on, tokio::time::sleep};
+use crate::block_on;
+use tokio::time::sleep;
 
 use crate::provers::{CpuProver, ProofOpts, ProverType};
 
diff --git a/crates/sdk/src/provers/cpu.rs b/crates/sdk/src/provers/cpu.rs
index 234e663909..cb3a98289f 100644
--- a/crates/sdk/src/provers/cpu.rs
+++ b/crates/sdk/src/provers/cpu.rs
@@ -2,11 +2,11 @@ use anyhow::Result;
 use sp1_core_executor::SP1Context;
 use sp1_core_machine::io::SP1Stdin;
 use sp1_prover::{components::DefaultProverComponents, SP1Prover};
+use sp1_stark::MachineProver;
 
-use crate::install::try_install_circuit_artifacts;
 use crate::{
-    provers::ProofOpts, Prover, SP1Proof, SP1ProofKind, SP1ProofWithPublicValues, SP1ProvingKey,
-    SP1VerifyingKey,
+    install::try_install_circuit_artifacts, provers::ProofOpts, Prover, SP1Proof, SP1ProofKind,
+    SP1ProofWithPublicValues, SP1ProvingKey, SP1VerifyingKey,
 };
 
 use super::ProverType;
@@ -35,7 +35,9 @@ impl Prover<DefaultProverComponents> for CpuProver {
     }
 
     fn setup(&self, elf: &[u8]) -> (SP1ProvingKey, SP1VerifyingKey) {
-        self.prover.setup(elf)
+        // The inner prover's setup yields four values; only the first and last are returned.
+        let (pk, _, _, vk) = self.prover.setup(elf);
+        (pk, vk)
     }
 
     fn sp1_prover(&self) -> &SP1Prover<DefaultProverComponents> {
@@ -51,8 +52,11 @@ impl Prover<DefaultProverComponents> for CpuProver {
         kind: SP1ProofKind,
     ) -> Result<SP1ProofWithPublicValues> {
         // Generate the core proof.
+        let program = self.prover.get_program(&pk.elf).unwrap();
+        let pk_d = self.prover.core_prover.pk_to_device(&pk.pk);
+
         let proof: sp1_prover::SP1ProofWithMetadata<sp1_prover::SP1CoreProofData> =
-            self.prover.prove_core(pk, &stdin, opts.sp1_prover_opts, context)?;
+            self.prover.prove_core(&pk_d, program, &stdin, opts.sp1_prover_opts, context)?;
         if kind == SP1ProofKind::Core {
             return Ok(SP1ProofWithPublicValues {
                 proof: SP1Proof::Core(proof.proof.0),
diff --git a/crates/sdk/src/provers/cuda.rs b/crates/sdk/src/provers/cuda.rs
index 5f8ab983aa..251eb00fb3 100644
--- a/crates/sdk/src/provers/cuda.rs
+++ b/crates/sdk/src/provers/cuda.rs
@@ -4,10 +4,9 @@ use sp1_cuda::SP1CudaProver;
 use sp1_prover::{components::DefaultProverComponents, SP1Prover};
 
 use super::ProverType;
-use crate::install::try_install_circuit_artifacts;
 use crate::{
-    provers::ProofOpts, Prover, SP1Context, SP1Proof, SP1ProofKind, SP1ProofWithPublicValues,
-    SP1ProvingKey, SP1VerifyingKey,
+    install::try_install_circuit_artifacts, provers::ProofOpts, Prover, SP1Context, SP1Proof,
+    SP1ProofKind, SP1ProofWithPublicValues, SP1ProvingKey, SP1VerifyingKey,
 };
 
 /// An implementation of [crate::ProverClient] that can generate proofs locally using CUDA.
@@ -30,7 +29,8 @@ impl Prover<DefaultProverComponents> for CudaProver {
     }
 
     fn setup(&self, elf: &[u8]) -> (SP1ProvingKey, SP1VerifyingKey) {
-        self.prover.setup(elf)
+        let (pk, _, _, vk) = self.prover.setup(elf);
+        (pk, vk)
     }
 
     fn sp1_prover(&self) -> &SP1Prover<DefaultProverComponents> {
@@ -48,7 +48,8 @@ impl Prover<DefaultProverComponents> for CudaProver {
         tracing::warn!("opts and context are ignored for the cuda prover");
 
         // Generate the core proof.
-        let proof = self.cuda_prover.prove_core(pk, &stdin)?;
+        let (_, _) = self.cuda_prover.setup(&pk.elf).unwrap();
+        let proof = self.cuda_prover.prove_core(&stdin)?;
         if kind == SP1ProofKind::Core {
             return Ok(SP1ProofWithPublicValues {
                 proof: SP1Proof::Core(proof.proof.0),
diff --git a/crates/sdk/src/provers/mock.rs b/crates/sdk/src/provers/mock.rs
index ca317972ac..efecdee539 100644
--- a/crates/sdk/src/provers/mock.rs
+++ b/crates/sdk/src/provers/mock.rs
@@ -2,7 +2,9 @@
 use hashbrown::HashMap;
 use sp1_core_executor::{SP1Context, SP1ReduceProof};
 use sp1_core_machine::io::SP1Stdin;
-use sp1_stark::{ShardCommitment, ShardOpenedValues, ShardProof, StarkVerifyingKey};
+use sp1_stark::{
+    septic_digest::SepticDigest, ShardCommitment, ShardOpenedValues, ShardProof, StarkVerifyingKey,
+};
 
 use crate::{
     Prover, SP1Proof, SP1ProofKind, SP1ProofWithPublicValues, SP1ProvingKey, SP1VerificationError,
@@ -39,7 +41,8 @@ impl Prover<DefaultProverComponents> for MockProver {
     }
 
     fn setup(&self, elf: &[u8]) -> (SP1ProvingKey, SP1VerifyingKey) {
-        self.prover.setup(elf)
+        let (pk, _, _, vk) = self.prover.setup(elf);
+        (pk, vk)
     }
 
     fn sp1_prover(&self) -> &SP1Prover {
@@ -69,8 +72,7 @@ impl Prover<DefaultProverComponents> for MockProver {
 
                 let shard_proof = ShardProof {
                     commitment: ShardCommitment {
-                        global_main_commit: [BabyBear::zero(); 8].into(),
-                        local_main_commit: [BabyBear::zero(); 8].into(),
+                        main_commit: [BabyBear::zero(); 8].into(),
                         permutation_commit: [BabyBear::zero(); 8].into(),
                         quotient_commit: [BabyBear::zero(); 8].into(),
                     },
@@ -91,6 +93,7 @@ impl Prover<DefaultProverComponents> for MockProver {
                 let reduce_vk = StarkVerifyingKey {
                     commit: [BabyBear::zero(); 8].into(),
                     pc_start: BabyBear::zero(),
+                    initial_global_cumulative_sum: SepticDigest::<BabyBear>::zero(),
                     chip_information: vec![],
                     chip_ordering: HashMap::new(),
                 };
diff --git a/crates/sdk/src/provers/mod.rs b/crates/sdk/src/provers/mod.rs
index 1db5309fac..626c5cdb30 100644
--- a/crates/sdk/src/provers/mod.rs
+++ b/crates/sdk/src/provers/mod.rs
@@ -10,8 +10,7 @@ pub use mock::MockProver;
 
 use itertools::Itertools;
 use p3_field::PrimeField32;
-use std::borrow::Borrow;
-use std::time::Duration;
+use std::{borrow::Borrow, time::Duration};
 
 use anyhow::Result;
 use sp1_core_executor::SP1Context;
@@ -24,8 +23,9 @@ use sp1_stark::{air::PublicValues, MachineVerificationError, SP1ProverOpts, Word
 use strum_macros::EnumString;
 use thiserror::Error;
 
-use crate::install::try_install_circuit_artifacts;
-use crate::{SP1Proof, SP1ProofKind, SP1ProofWithPublicValues};
+use crate::{
+    install::try_install_circuit_artifacts, SP1Proof, SP1ProofKind, SP1ProofWithPublicValues,
+};
 
 /// The type of prover.
 #[derive(Debug, PartialEq, EnumString)]
diff --git a/crates/stark/Cargo.toml b/crates/stark/Cargo.toml
index f1b5891a0a..b8f47c0b22 100644
--- a/crates/stark/Cargo.toml
+++ b/crates/stark/Cargo.toml
@@ -37,6 +37,8 @@ itertools = { workspace = true }
 tracing = { workspace = true }
 rayon-scan = "0.1.1"
 arrayref = "0.3.8"
+num-bigint = { version = "0.4.3", default-features = false }
+
 strum = "0.26.3"
 strum_macros = "0.26.4"
 sysinfo = "0.30.13"
diff --git a/crates/stark/src/air/builder.rs b/crates/stark/src/air/builder.rs
index dc89f80d2e..8d1c6fb7b4 100644
--- a/crates/stark/src/air/builder.rs
+++ b/crates/stark/src/air/builder.rs
@@ -10,7 +10,9 @@ use serde::{Deserialize, Serialize};
 use strum_macros::{Display, EnumIter};
 
 use super::{interaction::AirInteraction, BinomialExtension};
-use crate::{lookup::InteractionKind, Word};
+use crate::{
+    lookup::InteractionKind, septic_digest::SepticDigest, septic_extension::SepticExtension, Word,
+};
 
 /// The scope of an interaction.
 #[derive(
@@ -186,7 +188,6 @@ pub trait AluAirBuilder: BaseAirBuilder {
         b: Word<impl Into<Self::Expr>>,
         c: Word<impl Into<Self::Expr>>,
         shard: impl Into<Self::Expr>,
-        nonce: impl Into<Self::Expr>,
         multiplicity: impl Into<Self::Expr>,
     ) {
         let values = once(opcode.into())
@@ -194,7 +195,6 @@ pub trait AluAirBuilder: BaseAirBuilder {
             .chain(b.0.into_iter().map(Into::into))
             .chain(c.0.into_iter().map(Into::into))
             .chain(once(shard.into()))
-            .chain(once(nonce.into()))
             .collect();
 
         self.send(
@@ -212,7 +212,6 @@ pub trait AluAirBuilder: BaseAirBuilder {
         b: Word<impl Into<Self::Expr>>,
         c: Word<impl Into<Self::Expr>>,
         shard: impl Into<Self::Expr>,
-        nonce: impl Into<Self::Expr>,
         multiplicity: impl Into<Self::Expr>,
     ) {
         let values = once(opcode.into())
@@ -220,7 +219,6 @@ pub trait AluAirBuilder: BaseAirBuilder {
             .chain(b.0.into_iter().map(Into::into))
             .chain(c.0.into_iter().map(Into::into))
             .chain(once(shard.into()))
-            .chain(once(nonce.into()))
             .collect();
 
         self.receive(
@@ -235,7 +233,6 @@ pub trait AluAirBuilder: BaseAirBuilder {
         &mut self,
         shard: impl Into<Self::Expr> + Clone,
         clk: impl Into<Self::Expr> + Clone,
-        nonce: impl Into<Self::Expr> + Clone,
         syscall_id: impl Into<Self::Expr> + Clone,
         arg1: impl Into<Self::Expr> + Clone,
         arg2: impl Into<Self::Expr> + Clone,
@@ -247,7 +244,6 @@ pub trait AluAirBuilder: BaseAirBuilder {
                 vec![
                     shard.clone().into(),
                     clk.clone().into(),
-                    nonce.clone().into(),
                     syscall_id.clone().into(),
                     arg1.clone().into(),
                     arg2.clone().into(),
@@ -265,7 +261,6 @@ pub trait AluAirBuilder: BaseAirBuilder {
         &mut self,
         shard: impl Into<Self::Expr> + Clone,
         clk: impl Into<Self::Expr> + Clone,
-        nonce: impl Into<Self::Expr> + Clone,
         syscall_id: impl Into<Self::Expr> + Clone,
         arg1: impl Into<Self::Expr> + Clone,
         arg2: impl Into<Self::Expr> + Clone,
@@ -277,7 +272,6 @@ pub trait AluAirBuilder: BaseAirBuilder {
                 vec![
                     shard.clone().into(),
                     clk.clone().into(),
-                    nonce.clone().into(),
                     syscall_id.clone().into(),
                     arg1.clone().into(),
                     arg2.clone().into(),
@@ -328,19 +322,39 @@ pub trait ExtensionAirBuilder: BaseAirBuilder {
     }
 }
 
+/// A builder that can operate on septic extension elements.
+pub trait SepticExtensionAirBuilder: BaseAirBuilder {
+    /// Asserts that two septic extension elements are equal, coefficient by coefficient.
+    fn assert_septic_ext_eq<I: Into<Self::Expr>>(
+        &mut self,
+        left: SepticExtension<I>,
+        right: SepticExtension<I>,
+    ) {
+        for (left, right) in left.0.into_iter().zip(right.0) {
+            self.assert_eq(left, right);
+        }
+    }
+}
+
 /// A builder that implements a permutation argument.
 pub trait MultiTableAirBuilder<'a>: PermutationAirBuilder {
-    /// The type of the cumulative sum.
-    type Sum: Into<Self::ExprEF> + Copy;
+    /// The type of the local cumulative sum.
+    type LocalSum: Into<Self::ExprEF> + Copy;
+
+    /// The type of the global cumulative sum.
+    type GlobalSum: Into<Self::Expr> + Copy;
+
+    /// Returns the local cumulative sum of the permutation.
+    fn local_cumulative_sum(&self) -> &'a Self::LocalSum;
 
-    /// Returns the cumulative sum of the permutation.
-    fn cumulative_sums(&self) -> &'a [Self::Sum];
+    /// Returns the global cumulative sum of the permutation.
+    fn global_cumulative_sum(&self) -> &'a SepticDigest<Self::GlobalSum>;
 }
 
 /// A trait that contains the common helper methods for building `SP1 recursion` and SP1 machine
 /// AIRs.
 pub trait MachineAirBuilder:
-    BaseAirBuilder + ExtensionAirBuilder + AirBuilderWithPublicValues
+    BaseAirBuilder + ExtensionAirBuilder + SepticExtensionAirBuilder + AirBuilderWithPublicValues
 {
 }
 
@@ -362,6 +376,7 @@ impl<AB: BaseAirBuilder> ByteAirBuilder for AB {}
 impl<AB: BaseAirBuilder> AluAirBuilder for AB {}
 
 impl<AB: BaseAirBuilder> ExtensionAirBuilder for AB {}
+impl<AB: BaseAirBuilder> SepticExtensionAirBuilder for AB {}
 impl<AB: BaseAirBuilder + AirBuilderWithPublicValues> MachineAirBuilder for AB {}
 impl<AB: BaseAirBuilder + AirBuilderWithPublicValues> SP1AirBuilder for AB {}
 
diff --git a/crates/stark/src/air/machine.rs b/crates/stark/src/air/machine.rs
index 0a9b0af4a8..2398e19858 100644
--- a/crates/stark/src/air/machine.rs
+++ b/crates/stark/src/air/machine.rs
@@ -2,7 +2,7 @@ use p3_air::BaseAir;
 use p3_field::Field;
 use p3_matrix::dense::RowMajorMatrix;
 
-use crate::MachineRecord;
+use crate::{septic_digest::SepticDigest, MachineRecord};
 
 pub use sp1_derive::MachineAir;
 
@@ -59,4 +59,6 @@ pub trait MachineAir<F: Field>: BaseAir<F> + 'static + Send + Sync {
 pub trait MachineProgram<F>: Send + Sync {
     /// Gets the starting program counter.
     fn pc_start(&self) -> F;
+    /// Gets the initial global cumulative sum.
+    fn initial_global_cumulative_sum(&self) -> SepticDigest<F>;
 }
diff --git a/crates/stark/src/chip.rs b/crates/stark/src/chip.rs
index 2d627d2ccd..eced8524f9 100644
--- a/crates/stark/src/chip.rs
+++ b/crates/stark/src/chip.rs
@@ -7,24 +7,26 @@ use p3_uni_stark::{get_max_constraint_degree, SymbolicAirBuilder};
 use p3_util::log2_ceil_usize;
 
 use crate::{
-    air::{MachineAir, MultiTableAirBuilder, SP1AirBuilder},
+    air::{InteractionScope, MachineAir, MultiTableAirBuilder, SP1AirBuilder},
+    local_permutation_trace_width,
     lookup::{Interaction, InteractionBuilder, InteractionKind},
 };
 
 use super::{
-    eval_permutation_constraints, generate_permutation_trace, get_grouped_maps, PROOF_MAX_NUM_PVS,
+    eval_permutation_constraints, generate_permutation_trace, scoped_interactions,
+    PROOF_MAX_NUM_PVS,
 };
 
 /// An Air that encodes lookups based on interactions.
 pub struct Chip<F: Field, A> {
     /// The underlying AIR of the chip for constraint evaluation.
-    air: A,
+    pub air: A,
     /// The interactions that the chip sends.
-    sends: Vec<Interaction<F>>,
+    pub sends: Vec<Interaction<F>>,
     /// The interactions that the chip receives.
-    receives: Vec<Interaction<F>>,
+    pub receives: Vec<Interaction<F>>,
     /// The relative log degree of the quotient polynomial, i.e. `log2(max_constraint_degree - 1)`.
-    log_quotient_degree: usize,
+    pub log_quotient_degree: usize,
 }
 
 impl<F: Field, A> Chip<F, A> {
@@ -119,13 +121,13 @@ where
         preprocessed: Option<&RowMajorMatrix<F>>,
         main: &RowMajorMatrix<F>,
         random_elements: &[EF],
-    ) -> (RowMajorMatrix<EF>, EF, EF)
+    ) -> (RowMajorMatrix<EF>, EF)
     where
         F: PrimeField,
         A: MachineAir<F>,
     {
         let batch_size = self.logup_batch_size();
-        generate_permutation_trace(
+        generate_permutation_trace::<F, EF>(
             &self.sends,
             &self.receives,
             preprocessed,
@@ -138,10 +140,15 @@ where
     /// Returns the width of the permutation trace.
     #[inline]
     pub fn permutation_width(&self) -> usize {
-        let (_, _, grouped_widths) =
-            get_grouped_maps(self.sends(), self.receives(), self.logup_batch_size());
-
-        grouped_widths.values().sum()
+        let (scoped_sends, scoped_receives) = scoped_interactions(self.sends(), self.receives());
+        let empty = Vec::new();
+        let local_sends = scoped_sends.get(&InteractionScope::Local).unwrap_or(&empty);
+        let local_receives = scoped_receives.get(&InteractionScope::Local).unwrap_or(&empty);
+
+        local_permutation_trace_width(
+            local_sends.len() + local_receives.len(),
+            self.logup_batch_size(),
+        )
     }
 
     /// Returns the cost of a row in the chip.
@@ -223,7 +230,7 @@ where
 impl<'a, F, A, AB> Air<AB> for Chip<F, A>
 where
     F: Field,
-    A: Air<AB>,
+    A: Air<AB> + MachineAir<F>,
     AB: SP1AirBuilder<F = F> + MultiTableAirBuilder<'a> + PairBuilder + 'a,
 {
     fn eval(&self, builder: &mut AB) {
@@ -231,7 +238,13 @@ where
         self.air.eval(builder);
         // Evaluate permutation constraints.
         let batch_size = self.logup_batch_size();
-        eval_permutation_constraints(&self.sends, &self.receives, batch_size, builder);
+        eval_permutation_constraints(
+            &self.sends,
+            &self.receives,
+            batch_size,
+            self.air.commit_scope(),
+            builder,
+        );
     }
 }
 
diff --git a/crates/stark/src/debug.rs b/crates/stark/src/debug.rs
index 78a7d0c95c..8808458165 100644
--- a/crates/stark/src/debug.rs
+++ b/crates/stark/src/debug.rs
@@ -14,11 +14,13 @@ use p3_matrix::{
     stack::VerticalPair,
     Matrix,
 };
-use p3_maybe_rayon::prelude::ParallelBridge;
-use p3_maybe_rayon::prelude::ParallelIterator;
+use p3_maybe_rayon::prelude::{ParallelBridge, ParallelIterator};
 
 use super::{MachineChip, StarkGenericConfig, Val};
-use crate::air::{EmptyMessageBuilder, MachineAir, MultiTableAirBuilder};
+use crate::{
+    air::{EmptyMessageBuilder, MachineAir, MultiTableAirBuilder},
+    septic_digest::SepticDigest,
+};
 
 /// Checks that the constraints of the given AIR are satisfied, including the permutation trace.
 ///
@@ -31,7 +33,8 @@ pub fn debug_constraints<SC, A>(
     perm: &RowMajorMatrix<SC::Challenge>,
     perm_challenges: &[SC::Challenge],
     public_values: &[Val<SC>],
-    cumulative_sums: &[SC::Challenge],
+    local_cumulative_sum: &SC::Challenge,
+    global_cumulative_sum: &SepticDigest<Val<SC>>,
 ) where
     SC: StarkGenericConfig,
     Val<SC>: PrimeField32,
@@ -84,7 +87,8 @@ pub fn debug_constraints<SC, A>(
                 RowMajorMatrixView::new_row(perm_next),
             ),
             perm_challenges,
-            cumulative_sums,
+            local_cumulative_sum,
+            global_cumulative_sum,
             is_first_row: Val::<SC>::zero(),
             is_last_row: Val::<SC>::zero(),
             is_transition: Val::<SC>::one(),
@@ -130,7 +134,8 @@ pub struct DebugConstraintBuilder<'a, F: Field, EF: ExtensionField<F>> {
     pub(crate) preprocessed: VerticalPair<RowMajorMatrixView<'a, F>, RowMajorMatrixView<'a, F>>,
     pub(crate) main: VerticalPair<RowMajorMatrixView<'a, F>, RowMajorMatrixView<'a, F>>,
     pub(crate) perm: VerticalPair<RowMajorMatrixView<'a, EF>, RowMajorMatrixView<'a, EF>>,
-    pub(crate) cumulative_sums: &'a [EF],
+    pub(crate) local_cumulative_sum: &'a EF,
+    pub(crate) global_cumulative_sum: &'a SepticDigest<F>,
     pub(crate) perm_challenges: &'a [EF],
     pub(crate) is_first_row: F,
     pub(crate) is_last_row: F,
@@ -257,10 +262,15 @@ where
     F: Field,
     EF: ExtensionField<F>,
 {
-    type Sum = EF;
+    type LocalSum = EF;
+    type GlobalSum = F;
+
+    fn local_cumulative_sum(&self) -> &'a Self::LocalSum {
+        self.local_cumulative_sum
+    }
 
-    fn cumulative_sums(&self) -> &'a [Self::Sum] {
-        self.cumulative_sums
+    fn global_cumulative_sum(&self) -> &'a SepticDigest<Self::GlobalSum> {
+        self.global_cumulative_sum
     }
 }
 
diff --git a/crates/stark/src/folder.rs b/crates/stark/src/folder.rs
index 4666e2e94c..e6688e26c4 100644
--- a/crates/stark/src/folder.rs
+++ b/crates/stark/src/folder.rs
@@ -7,7 +7,10 @@ use p3_field::{AbstractField, ExtensionField, Field};
 use p3_matrix::{dense::RowMajorMatrixView, stack::VerticalPair};
 
 use super::{Challenge, PackedChallenge, PackedVal, StarkGenericConfig, Val};
-use crate::air::{EmptyMessageBuilder, MultiTableAirBuilder};
+use crate::{
+    air::{EmptyMessageBuilder, MultiTableAirBuilder},
+    septic_digest::SepticDigest,
+};
 use p3_air::{
     AirBuilder, AirBuilderWithPublicValues, ExtensionBuilder, PairBuilder, PermutationAirBuilder,
 };
@@ -27,8 +30,10 @@ pub struct ProverConstraintFolder<'a, SC: StarkGenericConfig> {
     >,
     /// The challenges for the permutation.
     pub perm_challenges: &'a [PackedChallenge<SC>],
-    /// The cumulative sums for the permutation.
-    pub cumulative_sums: &'a [PackedChallenge<SC>],
+    /// The local cumulative sum for the permutation.
+    pub local_cumulative_sum: &'a PackedChallenge<SC>,
+    /// The global cumulative sum for the permutation.
+    pub global_cumulative_sum: &'a SepticDigest<Val<SC>>,
     /// The selector for the first row.
     pub is_first_row: PackedVal<SC>,
     /// The selector for the last row.
@@ -112,10 +117,15 @@ impl<'a, SC: StarkGenericConfig> PermutationAirBuilder for ProverConstraintFolde
 }
 
 impl<'a, SC: StarkGenericConfig> MultiTableAirBuilder<'a> for ProverConstraintFolder<'a, SC> {
-    type Sum = PackedChallenge<SC>;
+    type LocalSum = PackedChallenge<SC>;
+    type GlobalSum = Val<SC>;
+
+    fn local_cumulative_sum(&self) -> &'a Self::LocalSum {
+        self.local_cumulative_sum
+    }
 
-    fn cumulative_sums(&self) -> &'a [Self::Sum] {
-        self.cumulative_sums
+    fn global_cumulative_sum(&self) -> &'a SepticDigest<Self::GlobalSum> {
+        self.global_cumulative_sum
     }
 }
 
@@ -155,8 +165,10 @@ pub struct GenericVerifierConstraintFolder<'a, F, EF, PubVar, Var, Expr> {
     pub perm: VerticalPair<RowMajorMatrixView<'a, Var>, RowMajorMatrixView<'a, Var>>,
     /// The challenges for the permutation.
     pub perm_challenges: &'a [Var],
-    /// The cumulative sums of the permutation.
-    pub cumulative_sums: &'a [Var],
+    /// The local cumulative sum of the permutation.
+    pub local_cumulative_sum: &'a Var,
+    /// The global cumulative sum of the permutation.
+    pub global_cumulative_sum: &'a SepticDigest<PubVar>,
     /// The selector for the first row.
     pub is_first_row: Var,
     /// The selector for the last row.
@@ -345,10 +357,15 @@ where
         + Sync,
     PubVar: Into<Expr> + Copy,
 {
-    type Sum = Var;
+    type LocalSum = Var;
+    type GlobalSum = PubVar;
+
+    fn local_cumulative_sum(&self) -> &'a Self::LocalSum {
+        self.local_cumulative_sum
+    }
 
-    fn cumulative_sums(&self) -> &'a [Self::Sum] {
-        self.cumulative_sums
+    fn global_cumulative_sum(&self) -> &'a SepticDigest<Self::GlobalSum> {
+        self.global_cumulative_sum
     }
 }
 
diff --git a/crates/stark/src/lib.rs b/crates/stark/src/lib.rs
index 924dcb3356..539d2dc886 100644
--- a/crates/stark/src/lib.rs
+++ b/crates/stark/src/lib.rs
@@ -33,6 +33,9 @@ mod permutation;
 mod prover;
 mod quotient;
 mod record;
+pub mod septic_curve;
+pub mod septic_digest;
+pub mod septic_extension;
 mod types;
 mod util;
 mod verifier;
diff --git a/crates/stark/src/machine.rs b/crates/stark/src/machine.rs
index 5a5f5ea8db..28724396a3 100644
--- a/crates/stark/src/machine.rs
+++ b/crates/stark/src/machine.rs
@@ -1,3 +1,6 @@
+use crate::{
+    septic_curve::SepticCurve, septic_digest::SepticDigest, septic_extension::SepticExtension,
+};
 use hashbrown::HashMap;
 use itertools::Itertools;
 use p3_air::Air;
@@ -7,7 +10,7 @@ use p3_field::{AbstractExtensionField, AbstractField, Field, PrimeField32};
 use p3_matrix::{dense::RowMajorMatrix, Dimensions, Matrix};
 use p3_maybe_rayon::prelude::*;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{array, cmp::Reverse, env, fmt::Debug, time::Instant};
+use std::{cmp::Reverse, env, fmt::Debug, iter::once, time::Instant};
 use tracing::instrument;
 
 use super::{debug_constraints, Dom};
@@ -60,6 +63,8 @@ pub struct StarkProvingKey<SC: StarkGenericConfig> {
     pub commit: Com<SC>,
     /// The start pc of the program.
     pub pc_start: Val<SC>,
+    /// The starting global digest of the program, after incorporating the initial memory.
+    pub initial_global_cumulative_sum: SepticDigest<Val<SC>>,
     /// The preprocessed traces.
     pub traces: Vec<RowMajorMatrix<Val<SC>>>,
     /// The pcs data for the preprocessed traces.
@@ -75,9 +80,10 @@ impl<SC: StarkGenericConfig> StarkProvingKey<SC> {
     pub fn observe_into(&self, challenger: &mut SC::Challenger) {
         challenger.observe(self.commit.clone());
         challenger.observe(self.pc_start);
-        for _ in 0..7 {
-            challenger.observe(Val::<SC>::zero());
-        }
+        challenger.observe_slice(&self.initial_global_cumulative_sum.0.x.0);
+        challenger.observe_slice(&self.initial_global_cumulative_sum.0.y.0);
+        // Observe the padding.
+        challenger.observe(Val::<SC>::zero());
     }
 }
 
@@ -90,6 +96,8 @@ pub struct StarkVerifyingKey<SC: StarkGenericConfig> {
     pub commit: Com<SC>,
     /// The start pc of the program.
     pub pc_start: Val<SC>,
+    /// The starting global digest of the program, after incorporating the initial memory.
+    pub initial_global_cumulative_sum: SepticDigest<Val<SC>>,
     /// The chip information.
     pub chip_information: Vec<(String, Dom<SC>, Dimensions)>,
     /// The chip ordering.
@@ -101,9 +109,10 @@ impl<SC: StarkGenericConfig> StarkVerifyingKey<SC> {
     pub fn observe_into(&self, challenger: &mut SC::Challenger) {
         challenger.observe(self.commit.clone());
         challenger.observe(self.pc_start);
-        for _ in 0..7 {
-            challenger.observe(Val::<SC>::zero());
-        }
+        challenger.observe_slice(&self.initial_global_cumulative_sum.0.x.0);
+        challenger.observe_slice(&self.initial_global_cumulative_sum.0.y.0);
+        // Observe the padding.
+        challenger.observe(Val::<SC>::zero());
     }
 }
 
@@ -237,17 +246,25 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> StarkMachine<SC, A> {
             named_preprocessed_traces.into_iter().map(|(_, _, trace)| trace).collect::<Vec<_>>();
 
         let pc_start = program.pc_start();
+        let initial_global_cumulative_sum = program.initial_global_cumulative_sum();
 
         (
             StarkProvingKey {
                 commit: commit.clone(),
                 pc_start,
+                initial_global_cumulative_sum,
                 traces,
                 data,
                 chip_ordering: chip_ordering.clone(),
                 local_only,
             },
-            StarkVerifyingKey { commit, pc_start, chip_information, chip_ordering },
+            StarkVerifyingKey {
+                commit,
+                pc_start,
+                initial_global_cumulative_sum,
+                chip_information,
+                chip_ordering,
+            },
         )
     }
 
@@ -301,45 +318,28 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> StarkMachine<SC, A> {
         SC::Challenger: Clone,
         A: for<'a> Air<VerifierConstraintFolder<'a, SC>>,
     {
-        let contains_global_bus = self.contains_global_bus();
-
         // Observe the preprocessed commitment.
         vk.observe_into(challenger);
-        tracing::debug_span!("observe challenges for all shards").in_scope(|| {
-            proof.shard_proofs.iter().for_each(|shard_proof| {
-                if contains_global_bus {
-                    challenger.observe(shard_proof.commitment.global_main_commit.clone());
-                }
-                challenger.observe_slice(&shard_proof.public_values[0..self.num_pv_elts()]);
-            });
-        });
 
         // Verify the shard proofs.
         if proof.shard_proofs.is_empty() {
             return Err(MachineVerificationError::EmptyProof);
         }
 
-        // Obtain the challenges used for the global permutation argument.
-        let global_permutation_challenges: [SC::Challenge; 2] = array::from_fn(|_| {
-            if contains_global_bus {
-                challenger.sample_ext_element()
-            } else {
-                SC::Challenge::zero()
-            }
-        });
-
         tracing::debug_span!("verify shard proofs").in_scope(|| {
             for (i, shard_proof) in proof.shard_proofs.iter().enumerate() {
                 tracing::debug_span!("verifying shard", shard = i).in_scope(|| {
                     let chips =
                         self.shard_chips_ordered(&shard_proof.chip_ordering).collect::<Vec<_>>();
+                    let mut shard_challenger = challenger.clone();
+                    shard_challenger
+                        .observe_slice(&shard_proof.public_values[0..self.num_pv_elts()]);
                     Verifier::verify_shard(
                         &self.config,
                         vk,
                         &chips,
-                        &mut challenger.clone(),
+                        &mut shard_challenger,
                         shard_proof,
-                        &global_permutation_challenges,
                     )
                     .map_err(MachineVerificationError::InvalidShardProof)
                 })?;
@@ -353,8 +353,9 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> StarkMachine<SC, A> {
             let sum = proof
                 .shard_proofs
                 .iter()
-                .map(|proof| proof.cumulative_sum(InteractionScope::Global))
-                .sum::<SC::Challenge>();
+                .map(ShardProof::global_cumulative_sum)
+                .chain(once(vk.initial_global_cumulative_sum))
+                .sum::<SepticDigest<Val<SC>>>();
 
             if !sum.is_zero() {
                 return Err(MachineVerificationError::NonZeroCumulativeSum(
@@ -386,12 +387,9 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> StarkMachine<SC, A> {
             permutation_challenges.push(challenger.sample_ext_element());
         }
 
-        // Obtain the challenges used for the local permutation argument.
-        for _ in 0..2 {
-            permutation_challenges.push(challenger.sample_ext_element());
-        }
+        let mut global_cumulative_sums = Vec::new();
+        global_cumulative_sums.push(pk.initial_global_cumulative_sum);
 
-        let mut global_cumulative_sum = SC::Challenge::zero();
         for shard in records.iter() {
             // Filter the chips based on what is used.
             let chips = self.shard_chips(shard).collect::<Vec<_>>();
@@ -409,27 +407,40 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> StarkMachine<SC, A> {
 
             // Generate the permutation traces.
             let mut permutation_traces = Vec::with_capacity(chips.len());
-            let mut cumulative_sums = Vec::with_capacity(chips.len());
+            let mut chip_cumulative_sums = Vec::with_capacity(chips.len());
             tracing::debug_span!("generate permutation traces").in_scope(|| {
                 chips
                     .par_iter()
                     .zip(traces.par_iter_mut())
                     .map(|(chip, (main_trace, pre_trace))| {
-                        let (trace, global_sum, local_sum) = chip.generate_permutation_trace(
+                        let (trace, local_sum) = chip.generate_permutation_trace(
                             *pre_trace,
                             main_trace,
                             &permutation_challenges,
                         );
-                        (trace, [global_sum, local_sum])
+                        let global_sum = if chip.commit_scope() == InteractionScope::Local {
+                            SepticDigest::<Val<SC>>::zero()
+                        } else {
+                            let main_trace_size = main_trace.height() * main_trace.width();
+                            let last_row =
+                                &main_trace.values[main_trace_size - 14..main_trace_size];
+                            SepticDigest(SepticCurve {
+                                x: SepticExtension::<Val<SC>>::from_base_fn(|i| last_row[i]),
+                                y: SepticExtension::<Val<SC>>::from_base_fn(|i| last_row[i + 7]),
+                            })
+                        };
+                        (trace, (global_sum, local_sum))
                     })
-                    .unzip_into_vecs(&mut permutation_traces, &mut cumulative_sums);
+                    .unzip_into_vecs(&mut permutation_traces, &mut chip_cumulative_sums);
             });
 
-            global_cumulative_sum +=
-                cumulative_sums.iter().map(|sum| sum[0]).sum::<SC::Challenge>();
+            let global_cumulative_sum =
+                chip_cumulative_sums.iter().map(|sums| sums.0).sum::<SepticDigest<Val<SC>>>();
+            global_cumulative_sums.push(global_cumulative_sum);
 
             let local_cumulative_sum =
-                cumulative_sums.iter().map(|sum| sum[1]).sum::<SC::Challenge>();
+                chip_cumulative_sums.iter().map(|sums| sums.1).sum::<SC::Challenge>();
+
             if !local_cumulative_sum.is_zero() {
                 tracing::warn!("Local cumulative sum is not zero");
                 tracing::debug_span!("debug local interactions").in_scope(|| {
@@ -474,7 +485,8 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> StarkMachine<SC, A> {
                             &permutation_traces[i],
                             &permutation_challenges,
                             &shard.public_values(),
-                            &cumulative_sums[i],
+                            &chip_cumulative_sums[i].1,
+                            &chip_cumulative_sums[i].0,
                         );
                     }
                 });
@@ -483,6 +495,9 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> StarkMachine<SC, A> {
 
         tracing::info!("Constraints verified successfully");
 
+        let global_cumulative_sum: SepticDigest<Val<SC>> =
+            global_cumulative_sums.iter().copied().sum();
+
         // If the global cumulative sum is not zero, debug the interactions.
         if !global_cumulative_sum.is_zero() {
             tracing::warn!("Global cumulative sum is not zero");
diff --git a/crates/stark/src/opts.rs b/crates/stark/src/opts.rs
index 34ea0d81b7..a006d5f17e 100644
--- a/crates/stark/src/opts.rs
+++ b/crates/stark/src/opts.rs
@@ -155,7 +155,7 @@ impl SplitOpts {
             keccak: deferred_shift_threshold / 24,
             sha_extend: deferred_shift_threshold / 48,
             sha_compress: deferred_shift_threshold / 80,
-            memory: deferred_shift_threshold * 4,
+            memory: deferred_shift_threshold * 16,
         }
     }
 }
diff --git a/crates/stark/src/permutation.rs b/crates/stark/src/permutation.rs
index 1d0fa11574..58725f7123 100644
--- a/crates/stark/src/permutation.rs
+++ b/crates/stark/src/permutation.rs
@@ -1,35 +1,32 @@
-use std::borrow::Borrow;
-
+use crate::{
+    air::{InteractionScope, MultiTableAirBuilder},
+    lookup::Interaction,
+};
 use hashbrown::HashMap;
 use itertools::Itertools;
-use p3_air::{ExtensionBuilder, PairBuilder};
-use p3_field::{AbstractExtensionField, AbstractField, ExtensionField, Field, PrimeField};
+use p3_air::{AirBuilder, ExtensionBuilder, PairBuilder};
+use p3_field::AbstractExtensionField;
+use p3_field::AbstractField;
+use p3_field::{ExtensionField, Field, PrimeField};
 use p3_matrix::{dense::RowMajorMatrix, Matrix};
 use p3_maybe_rayon::prelude::*;
 use rayon_scan::ScanParallelIterator;
-use strum::IntoEnumIterator;
-
-use crate::{
-    air::{InteractionScope, MultiTableAirBuilder},
-    lookup::Interaction,
-};
+use std::borrow::Borrow;
 
-/// Computes the width of the permutation trace.
-#[inline]
+/// Computes the width of the local permutation trace in terms of extension field elements.
 #[must_use]
-pub const fn permutation_trace_width(num_interactions: usize, batch_size: usize) -> usize {
-    if num_interactions == 0 {
-        0
-    } else {
-        num_interactions.div_ceil(batch_size) + 1
+pub const fn local_permutation_trace_width(nb_interactions: usize, batch_size: usize) -> usize {
+    if nb_interactions == 0 {
+        return 0;
     }
+    nb_interactions.div_ceil(batch_size) + 1
 }
 
-/// Populates a permutation row.
+/// Populates a local permutation row.
 #[inline]
 #[allow(clippy::too_many_arguments)]
 #[allow(clippy::needless_pass_by_value)]
-pub fn populate_permutation_row<F: PrimeField, EF: ExtensionField<F>>(
+pub fn populate_local_permutation_row<F: PrimeField, EF: ExtensionField<F>>(
     row: &mut [EF],
     preprocessed_row: &[F],
     main_row: &[F],
@@ -39,9 +36,7 @@ pub fn populate_permutation_row<F: PrimeField, EF: ExtensionField<F>>(
     batch_size: usize,
 ) {
     let alpha = random_elements[0];
-
-    // Generate the RLC elements to uniquely identify each item in the looked up tuple.
-    let betas = random_elements[1].powers();
+    let betas = random_elements[1].powers(); // TODO: optimize
 
     let interaction_chunks = &sends
         .iter()
@@ -75,15 +70,11 @@ pub fn populate_permutation_row<F: PrimeField, EF: ExtensionField<F>>(
 
-/// Returns the sends, receives, and permutation trace width grouped by scope.
+/// Returns the sends and receives grouped by scope.
 #[allow(clippy::type_complexity)]
-pub fn get_grouped_maps<F: Field>(
+pub fn scoped_interactions<F: Field>(
     sends: &[Interaction<F>],
     receives: &[Interaction<F>],
-    batch_size: usize,
-) -> (
-    HashMap<InteractionScope, Vec<Interaction<F>>>,
-    HashMap<InteractionScope, Vec<Interaction<F>>>,
-    HashMap<InteractionScope, usize>,
-) {
+) -> (HashMap<InteractionScope, Vec<Interaction<F>>>, HashMap<InteractionScope, Vec<Interaction<F>>>)
+{
     // Create a hashmap of scope -> vec<send interactions>.
     let mut sends = sends.to_vec();
     sends.sort_by_key(|k| k.scope);
@@ -104,23 +95,11 @@ pub fn get_grouped_maps<F: Field>(
         .map(|(k, values)| (k, values.cloned().collect_vec()))
         .collect();
 
-    // Create a hashmap of scope -> permutation trace width.
-    let grouped_widths = InteractionScope::iter()
-        .map(|scope| {
-            let empty_vec = vec![];
-            let sends = grouped_sends.get(&scope).unwrap_or(&empty_vec);
-            let receives = grouped_receives.get(&scope).unwrap_or(&empty_vec);
-            (scope, permutation_trace_width(sends.len() + receives.len(), batch_size))
-        })
-        .collect();
-
-    (grouped_sends, grouped_receives, grouped_widths)
+    (grouped_sends, grouped_receives)
 }
 
 /// Generates the permutation trace for the given chip and main trace based on a variant of `LogUp`.
-///
-/// The permutation trace has `(N+1)*EF::NUM_COLS` columns, where N is the number of interactions in
-/// the chip.
+#[allow(clippy::too_many_lines)]
 pub fn generate_permutation_trace<F: PrimeField, EF: ExtensionField<F>>(
     sends: &[Interaction<F>],
     receives: &[Interaction<F>],
@@ -128,109 +107,81 @@ pub fn generate_permutation_trace<F: PrimeField, EF: ExtensionField<F>>(
     main: &RowMajorMatrix<F>,
     random_elements: &[EF],
     batch_size: usize,
-) -> (RowMajorMatrix<EF>, EF, EF) {
-    let (grouped_sends, grouped_receives, grouped_widths) =
-        get_grouped_maps(sends, receives, batch_size);
+) -> (RowMajorMatrix<EF>, EF) {
+    let empty = vec![];
+    let (scoped_sends, scoped_receives) = scoped_interactions(sends, receives);
+    let local_sends = scoped_sends.get(&InteractionScope::Local).unwrap_or(&empty);
+    let local_receives = scoped_receives.get(&InteractionScope::Local).unwrap_or(&empty);
+
+    let local_permutation_width =
+        local_permutation_trace_width(local_sends.len() + local_receives.len(), batch_size);
 
     let height = main.height();
-    let permutation_trace_width = grouped_widths.values().sum::<usize>();
+    let permutation_trace_width = local_permutation_width;
     let mut permutation_trace = RowMajorMatrix::new(
         vec![EF::zero(); permutation_trace_width * height],
         permutation_trace_width,
     );
 
-    let mut global_cumulative_sum = EF::zero();
     let mut local_cumulative_sum = EF::zero();
 
-    for scope in InteractionScope::iter() {
-        let empty_vec = vec![];
-        let sends = grouped_sends.get(&scope).unwrap_or(&empty_vec);
-        let receives = grouped_receives.get(&scope).unwrap_or(&empty_vec);
-
-        if sends.is_empty() && receives.is_empty() {
-            continue;
-        }
-
-        let random_elements = match scope {
-            InteractionScope::Global => &random_elements[0..2],
-            InteractionScope::Local => &random_elements[2..4],
-        };
-
-        let row_range = match scope {
-            InteractionScope::Global => {
-                0..*grouped_widths.get(&InteractionScope::Global).expect("Expected global scope")
-            }
-            InteractionScope::Local => {
-                let global_perm_width =
-                    *grouped_widths.get(&InteractionScope::Global).expect("Expected global scope");
-                let local_perm_width =
-                    *grouped_widths.get(&InteractionScope::Local).expect("Expected local scope");
-                global_perm_width..global_perm_width + local_perm_width
-            }
-        };
-
-        // Compute the permutation trace values in parallel.
-        match preprocessed {
-            Some(prep) => {
-                permutation_trace
-                    .par_rows_mut()
-                    .zip_eq(prep.par_row_slices())
-                    .zip_eq(main.par_row_slices())
-                    .for_each(|((row, prep_row), main_row)| {
-                        populate_permutation_row(
-                            &mut row[row_range.start..row_range.end],
-                            prep_row,
-                            main_row,
-                            sends,
-                            receives,
-                            random_elements,
-                            batch_size,
-                        );
-                    });
-            }
-            None => {
-                permutation_trace.par_rows_mut().zip_eq(main.par_row_slices()).for_each(
-                    |(row, main_row)| {
-                        populate_permutation_row(
-                            &mut row[row_range.start..row_range.end],
-                            &[],
-                            main_row,
-                            sends,
-                            receives,
-                            random_elements,
-                            batch_size,
-                        );
-                    },
-                );
-            }
+    let random_elements = &random_elements[0..2];
+    let local_row_range = 0..local_permutation_width;
+
+    if !local_sends.is_empty() || !local_receives.is_empty() {
+        if let Some(prep) = preprocessed {
+            permutation_trace
+                .par_rows_mut()
+                .zip_eq(prep.par_row_slices())
+                .zip_eq(main.par_row_slices())
+                .for_each(|((row, prep_row), main_row)| {
+                    populate_local_permutation_row::<F, EF>(
+                        &mut row[0..local_permutation_width],
+                        prep_row,
+                        main_row,
+                        local_sends,
+                        local_receives,
+                        random_elements,
+                        batch_size,
+                    );
+                });
+        } else {
+            permutation_trace.par_rows_mut().zip_eq(main.par_row_slices()).for_each(
+                |(row, main_row)| {
+                    populate_local_permutation_row::<F, EF>(
+                        &mut row[0..local_permutation_width],
+                        &[],
+                        main_row,
+                        local_sends,
+                        local_receives,
+                        random_elements,
+                        batch_size,
+                    );
+                },
+            );
         }
 
         let zero = EF::zero();
-        let cumulative_sums = permutation_trace
+        let local_cumulative_sums = permutation_trace
             .par_rows_mut()
-            .map(|row| row[row_range.start..row_range.end - 1].iter().copied().sum::<EF>())
+            .map(|row| {
+                row[local_row_range.start..local_row_range.end - 1].iter().copied().sum::<EF>()
+            })
             .collect::<Vec<_>>();
 
-        let cumulative_sums =
-            cumulative_sums.into_par_iter().scan(|a, b| *a + *b, zero).collect::<Vec<_>>();
+        let local_cumulative_sums =
+            local_cumulative_sums.into_par_iter().scan(|a, b| *a + *b, zero).collect::<Vec<_>>();
 
-        match scope {
-            InteractionScope::Global => {
-                global_cumulative_sum = *cumulative_sums.last().unwrap();
-            }
-            InteractionScope::Local => {
-                local_cumulative_sum = *cumulative_sums.last().unwrap();
-            }
-        }
+        local_cumulative_sum = *local_cumulative_sums.last().unwrap();
 
-        permutation_trace.par_rows_mut().zip_eq(cumulative_sums.clone().into_par_iter()).for_each(
-            |(row, cumulative_sum)| {
-                row[row_range.end - 1] = cumulative_sum;
+        permutation_trace.par_rows_mut().zip_eq(local_cumulative_sums.into_par_iter()).for_each(
+            |(row, local_cumulative_sum)| {
+                row[local_row_range.end - 1] = local_cumulative_sum;
             },
         );
     }
 
-    (permutation_trace, global_cumulative_sum, local_cumulative_sum)
+    (permutation_trace, local_cumulative_sum)
 }
 
 /// Evaluates the permutation constraints for the given chip.
@@ -238,12 +189,13 @@ pub fn generate_permutation_trace<F: PrimeField, EF: ExtensionField<F>>(
 /// In particular, the constraints checked here are:
 ///     - The running sum column starts at zero.
 ///     - That the RLC per interaction is computed correctly.
-///     - The running sum column ends at the (currently) given cumalitive sum.
+///     - The running sum column ends at the (currently) given cumulative sum.
 #[allow(clippy::too_many_lines)]
 pub fn eval_permutation_constraints<'a, F, AB>(
     sends: &[Interaction<F>],
     receives: &[Interaction<F>],
     batch_size: usize,
+    commit_scope: InteractionScope,
     builder: &mut AB,
 ) where
     F: Field,
@@ -251,15 +203,16 @@ pub fn eval_permutation_constraints<'a, F, AB>(
     AB: MultiTableAirBuilder<'a, F = F> + PairBuilder,
     AB: 'a,
 {
-    let (grouped_sends, grouped_receives, grouped_widths) =
-        get_grouped_maps(sends, receives, batch_size);
+    let empty = vec![];
+    let (scoped_sends, scoped_receives) = scoped_interactions(sends, receives);
+    let local_sends = scoped_sends.get(&InteractionScope::Local).unwrap_or(&empty);
+    let local_receives = scoped_receives.get(&InteractionScope::Local).unwrap_or(&empty);
+
+    let local_permutation_width =
+        local_permutation_trace_width(local_sends.len() + local_receives.len(), batch_size);
+
+    let permutation_trace_width = local_permutation_width;
 
-    // Get the permutation challenges.
-    let permutation_challenges = builder.permutation_randomness();
-    let random_elements: Vec<AB::ExprEF> =
-        permutation_challenges.iter().map(|x| (*x).into()).collect();
-    let cumulative_sums: Vec<AB::ExprEF> =
-        builder.cumulative_sums().iter().map(|x| (*x).into()).collect();
     let preprocessed = builder.preprocessed();
     let main = builder.main();
     let perm = builder.permutation().to_row_major_matrix();
@@ -268,67 +221,41 @@ pub fn eval_permutation_constraints<'a, F, AB>(
     let main_local = main.to_row_major_matrix();
     let main_local = main_local.row_slice(0);
     let main_local: &[AB::Var] = (*main_local).borrow();
-    let perm_width = perm.width();
     let perm_local = perm.row_slice(0);
     let perm_local: &[AB::VarEF] = (*perm_local).borrow();
     let perm_next = perm.row_slice(1);
     let perm_next: &[AB::VarEF] = (*perm_next).borrow();
+    let perm_width = perm.width();
 
     // Assert that the permutation trace width is correct.
-    let expected_perm_width = grouped_widths.values().sum::<usize>();
-    if perm_width != expected_perm_width {
+    if perm_width != permutation_trace_width {
         panic!(
-            "permutation trace width is incorrect: expected {expected_perm_width}, got {perm_width}",
+            "permutation trace width is incorrect: expected {permutation_trace_width}, got {perm_width}",
         );
     }
 
-    for scope in InteractionScope::iter() {
-        let random_elements = match scope {
-            InteractionScope::Global => &random_elements[0..2],
-            InteractionScope::Local => &random_elements[2..4],
-        };
-
-        let (alpha, beta) = (&random_elements[0], &random_elements[1]);
-
-        let perm_local = match scope {
-            InteractionScope::Global => &perm_local[0..*grouped_widths.get(&scope).unwrap()],
-            InteractionScope::Local => {
-                let global_perm_width = *grouped_widths.get(&InteractionScope::Global).unwrap();
-                &perm_local
-                    [global_perm_width..global_perm_width + *grouped_widths.get(&scope).unwrap()]
-            }
-        };
-
-        let perm_next = match scope {
-            InteractionScope::Global => &perm_next[0..*grouped_widths.get(&scope).unwrap()],
-            InteractionScope::Local => {
-                let global_perm_width = *grouped_widths.get(&InteractionScope::Global).unwrap();
-                &perm_next
-                    [global_perm_width..global_perm_width + *grouped_widths.get(&scope).unwrap()]
-            }
-        };
-
-        let empty_vec = vec![];
-        let sends = grouped_sends.get(&scope).unwrap_or(&empty_vec);
-        let receives = grouped_receives.get(&scope).unwrap_or(&empty_vec);
-
-        if sends.is_empty() && receives.is_empty() {
-            continue;
-        }
+    // Get the permutation challenges.
+    let permutation_challenges = builder.permutation_randomness();
+    let random_elements: Vec<AB::ExprEF> =
+        permutation_challenges.iter().map(|x| (*x).into()).collect();
+    let local_cumulative_sum = builder.local_cumulative_sum();
 
+    let random_elements = &random_elements[0..2];
+    let (alpha, beta) = (&random_elements[0], &random_elements[1]);
+    if !local_sends.is_empty() || !local_receives.is_empty() {
         // Ensure that each batch sum m_i/f_i is computed correctly.
-        let interaction_chunks = &sends
+        let interaction_chunks = &local_sends
             .iter()
             .map(|int| (int, true))
-            .chain(receives.iter().map(|int| (int, false)))
+            .chain(local_receives.iter().map(|int| (int, false)))
             .chunks(batch_size);
 
         // Assert that the i-eth entry is equal to the sum_i m_i/rlc_i by constraints:
-        // entry * \prod_i rlc_i = \sum_i m_i * \prod_{j!=i} rlc_j over all columns of the permutation
-        // trace except the last column.
+        // entry * \prod_i rlc_i = \sum_i m_i * \prod_{j!=i} rlc_j over all columns of the
+        // permutation trace except the last column.
         for (entry, chunk) in perm_local[0..perm_local.len() - 1].iter().zip(interaction_chunks) {
-            // First, we calculate the random linear combinations and multiplicities with the correct
-            // sign depending on wetther the interaction is a send or a receive.
+            // First, we calculate the random linear combinations and multiplicities with the
+            // correct sign depending on whether the interaction is a send or a receive.
             let mut rlcs: Vec<AB::ExprEF> = Vec::with_capacity(batch_size);
             let mut multiplicities: Vec<AB::Expr> = Vec::with_capacity(batch_size);
             for (interaction, is_send) in chunk {
@@ -376,10 +303,14 @@ pub fn eval_permutation_constraints<'a, F, AB>(
         }
 
         // Compute the running local and next permutation sums.
-        let perm_width = grouped_widths.get(&scope).unwrap();
-        let sum_local =
-            perm_local[..perm_width - 1].iter().map(|x| (*x).into()).sum::<AB::ExprEF>();
-        let sum_next = perm_next[..perm_width - 1].iter().map(|x| (*x).into()).sum::<AB::ExprEF>();
+        let sum_local = perm_local[..local_permutation_width - 1]
+            .iter()
+            .map(|x| (*x).into())
+            .sum::<AB::ExprEF>();
+        let sum_next = perm_next[..local_permutation_width - 1]
+            .iter()
+            .map(|x| (*x).into())
+            .sum::<AB::ExprEF>();
         let phi_local: AB::ExprEF = (*perm_local.last().unwrap()).into();
         let phi_next: AB::ExprEF = (*perm_next.last().unwrap()).into();
 
@@ -389,13 +320,19 @@ pub fn eval_permutation_constraints<'a, F, AB>(
         // Assert that the cumulative sum is constrained to `phi_next - phi_local` on the transition
         // rows.
         builder.when_transition().assert_eq_ext(phi_next - phi_local.clone(), sum_next);
+        builder.when_last_row().assert_eq_ext(*perm_local.last().unwrap(), *local_cumulative_sum);
+    }
 
-        // Assert that the cumulative sum is constrained to `phi_local` on the last row.
-        let cumulative_sum = match scope {
-            InteractionScope::Global => &cumulative_sums[0],
-            InteractionScope::Local => &cumulative_sums[1],
-        };
-
-        builder.when_last_row().assert_eq_ext(*perm_local.last().unwrap(), cumulative_sum.clone());
+    // Handle global permutations.
+    let global_cumulative_sum = builder.global_cumulative_sum();
+    if commit_scope == InteractionScope::Global {
+        for i in 0..7 {
+            builder
+                .when_last_row()
+                .assert_eq(main_local[main_local.len() - 14 + i], global_cumulative_sum.0.x.0[i]);
+            builder
+                .when_last_row()
+                .assert_eq(main_local[main_local.len() - 7 + i], global_cumulative_sum.0.y.0[i]);
+        }
     }
 }
diff --git a/crates/stark/src/prover.rs b/crates/stark/src/prover.rs
index 95d3da46c5..ab4022fdc2 100644
--- a/crates/stark/src/prover.rs
+++ b/crates/stark/src/prover.rs
@@ -1,8 +1,10 @@
+use crate::septic_curve::SepticCurve;
+use crate::septic_digest::SepticDigest;
+use crate::septic_extension::SepticExtension;
 use core::fmt::Display;
-use hashbrown::HashMap;
 use itertools::Itertools;
 use serde::{de::DeserializeOwned, Serialize};
-use std::{array, cmp::Reverse, error::Error, time::Instant};
+use std::{cmp::Reverse, error::Error, time::Instant};
 
 use crate::{air::InteractionScope, AirOpenedValues, ChipOpenedValues, ShardOpenedValues};
 use p3_air::Air;
@@ -18,20 +20,11 @@ use super::{
     VerifierConstraintFolder,
 };
 use crate::{
-    air::MachineAir, config::ZeroCommitment, lookup::InteractionBuilder, opts::SP1CoreOpts,
-    record::MachineRecord, Challenger, DebugConstraintBuilder, MachineChip, MachineProof,
-    PackedChallenge, PcsProverData, ProverConstraintFolder, ShardCommitment, ShardMainData,
-    ShardProof, StarkVerifyingKey,
+    air::MachineAir, lookup::InteractionBuilder, opts::SP1CoreOpts, record::MachineRecord,
+    Challenger, DebugConstraintBuilder, MachineChip, MachineProof, PackedChallenge, PcsProverData,
+    ProverConstraintFolder, ShardCommitment, ShardMainData, ShardProof, StarkVerifyingKey,
 };
 
-/// A merged prover data item from the global and local prover data.
-pub struct MergedProverDataItem<'a, M> {
-    /// The trace.
-    pub trace: &'a M,
-    /// The main data index.
-    pub main_data_idx: usize,
-}
-
 /// An algorithmic & hardware independent prover implementation for any [`MachineAir`].
 pub trait MachineProver<SC: StarkGenericConfig, A: MachineAir<SC::Val>>:
     'static + Send + Sync
@@ -64,22 +57,13 @@ pub trait MachineProver<SC: StarkGenericConfig, A: MachineAir<SC::Val>>:
     fn pk_to_host(&self, pk: &Self::DeviceProvingKey) -> StarkProvingKey<SC>;
 
     /// Generate the main traces.
-    fn generate_traces(
-        &self,
-        record: &A::Record,
-        interaction_scope: InteractionScope,
-    ) -> Vec<(String, RowMajorMatrix<Val<SC>>)> {
+    fn generate_traces(&self, record: &A::Record) -> Vec<(String, RowMajorMatrix<Val<SC>>)> {
         let shard_chips = self.shard_chips(record).collect::<Vec<_>>();
-        let chips = shard_chips
-            .iter()
-            .filter(|chip| chip.commit_scope() == interaction_scope)
-            .collect::<Vec<_>>();
-        assert!(!chips.is_empty());
 
         // For each chip, generate the trace.
         let parent_span = tracing::debug_span!("generate traces for shard");
         parent_span.in_scope(|| {
-            chips
+            shard_chips
                 .par_iter()
                 .map(|chip| {
                     let chip_name = chip.name();
@@ -122,10 +106,8 @@ pub trait MachineProver<SC: StarkGenericConfig, A: MachineAir<SC::Val>>:
     fn open(
         &self,
         pk: &Self::DeviceProvingKey,
-        global_data: Option<ShardMainData<SC, Self::DeviceMatrix, Self::DeviceProverData>>,
-        local_data: ShardMainData<SC, Self::DeviceMatrix, Self::DeviceProverData>,
+        data: ShardMainData<SC, Self::DeviceMatrix, Self::DeviceProverData>,
         challenger: &mut SC::Challenger,
-        global_permutation_challenges: &[SC::Challenge],
     ) -> Result<ShardProof<SC>, Self::Error>;
 
     /// Generate a proof for the given records.
@@ -173,102 +155,6 @@ pub trait MachineProver<SC: StarkGenericConfig, A: MachineAir<SC::Val>>:
     {
         self.machine().debug_constraints(pk, records, challenger);
     }
-
-    /// Merge the global and local chips' sorted traces.
-    #[allow(clippy::type_complexity)]
-    fn merge_shard_traces<'a, 'b>(
-        &'a self,
-        global_traces: &'b [Self::DeviceMatrix],
-        global_chip_ordering: &'b HashMap<String, usize>,
-        local_traces: &'b [Self::DeviceMatrix],
-        local_chip_ordering: &'b HashMap<String, usize>,
-    ) -> (
-        HashMap<String, usize>,
-        Vec<InteractionScope>,
-        Vec<MergedProverDataItem<'b, Self::DeviceMatrix>>,
-    )
-    where
-        'a: 'b,
-    {
-        // Get the sort order of the chips.
-        let global_chips = global_chip_ordering
-            .iter()
-            .sorted_by_key(|(_, &i)| i)
-            .map(|chip| chip.0.clone())
-            .collect::<Vec<_>>();
-        let local_chips = local_chip_ordering
-            .iter()
-            .sorted_by_key(|(_, &i)| i)
-            .map(|chip| chip.0.clone())
-            .collect::<Vec<_>>();
-
-        let mut merged_chips = Vec::with_capacity(global_traces.len() + local_traces.len());
-        let mut merged_prover_data = Vec::with_capacity(global_chips.len() + local_chips.len());
-
-        assert!(global_traces.len() == global_chips.len());
-        let mut global_iter = global_traces.iter().zip(global_chips.iter()).enumerate();
-        assert!(local_traces.len() == local_chips.len());
-        let mut local_iter = local_traces.iter().zip(local_chips.iter()).enumerate();
-
-        let mut global_next = global_iter.next();
-        let mut local_next = local_iter.next();
-
-        let mut chip_scopes = Vec::new();
-
-        while global_next.is_some() || local_next.is_some() {
-            match (global_next, local_next) {
-                (Some(global), Some(local)) => {
-                    let (global_prover_data_idx, (global_trace, global_chip)) = global;
-                    let (local_prover_data_idx, (local_trace, local_chip)) = local;
-                    if (Reverse(global_trace.height()), global_chip)
-                        < (Reverse(local_trace.height()), local_chip)
-                    {
-                        merged_chips.push(global_chip.clone());
-                        chip_scopes.push(InteractionScope::Global);
-                        merged_prover_data.push(MergedProverDataItem {
-                            trace: global_trace,
-                            main_data_idx: global_prover_data_idx,
-                        });
-                        global_next = global_iter.next();
-                    } else {
-                        merged_chips.push(local_chip.clone());
-                        chip_scopes.push(InteractionScope::Local);
-                        merged_prover_data.push(MergedProverDataItem {
-                            trace: local_trace,
-                            main_data_idx: local_prover_data_idx,
-                        });
-                        local_next = local_iter.next();
-                    }
-                }
-                (Some(global), None) => {
-                    let (global_prover_data_idx, (global_trace, global_chip)) = global;
-                    merged_chips.push(global_chip.clone());
-                    chip_scopes.push(InteractionScope::Global);
-                    merged_prover_data.push(MergedProverDataItem {
-                        trace: global_trace,
-                        main_data_idx: global_prover_data_idx,
-                    });
-                    global_next = global_iter.next();
-                }
-                (None, Some(local)) => {
-                    let (local_prover_data_idx, (local_trace, local_chip)) = local;
-                    merged_chips.push(local_chip.clone());
-                    chip_scopes.push(InteractionScope::Local);
-                    merged_prover_data.push(MergedProverDataItem {
-                        trace: local_trace,
-                        main_data_idx: local_prover_data_idx,
-                    });
-                    local_next = local_iter.next();
-                }
-                (None, None) => break,
-            }
-        }
-
-        let chip_ordering =
-            merged_chips.iter().enumerate().map(|(i, name)| (name.clone(), i)).collect();
-
-        (chip_ordering, chip_scopes, merged_prover_data)
-    }
 }
 
 /// A proving key for any [`MachineAir`] that is agnostic to hardware.
@@ -279,6 +165,9 @@ pub trait MachineProvingKey<SC: StarkGenericConfig>: Send + Sync {
     /// The start pc.
     fn pc_start(&self) -> Val<SC>;
 
+    /// The initial global cumulative sum.
+    fn initial_global_cumulative_sum(&self) -> SepticDigest<Val<SC>>;
+
     /// Observe itself in the challenger.
     fn observe_into(&self, challenger: &mut Challenger<SC>);
 }
@@ -374,49 +263,15 @@ where
     fn open(
         &self,
         pk: &StarkProvingKey<SC>,
-        global_data: Option<ShardMainData<SC, Self::DeviceMatrix, Self::DeviceProverData>>,
-        local_data: ShardMainData<SC, Self::DeviceMatrix, Self::DeviceProverData>,
+        data: ShardMainData<SC, Self::DeviceMatrix, Self::DeviceProverData>,
         challenger: &mut <SC as StarkGenericConfig>::Challenger,
-        global_permutation_challenges: &[SC::Challenge],
     ) -> Result<ShardProof<SC>, Self::Error> {
-        let (global_traces, global_main_commit, global_main_data, global_chip_ordering) =
-            if let Some(global_data) = global_data {
-                let ShardMainData {
-                    traces: global_traces,
-                    main_commit: global_main_commit,
-                    main_data: global_main_data,
-                    chip_ordering: global_chip_ordering,
-                    public_values: _,
-                } = global_data;
-                (global_traces, global_main_commit, Some(global_main_data), global_chip_ordering)
-            } else {
-                (vec![], self.config().pcs().zero_commitment(), None, HashMap::new())
-            };
-
-        let ShardMainData {
-            traces: local_traces,
-            main_commit: local_main_commit,
-            main_data: local_main_data,
-            chip_ordering: local_chip_ordering,
-            public_values: local_public_values,
-        } = local_data;
-
-        // Merge the chip ordering and traces from the global and local data.
-        let (all_chips_ordering, all_chip_scopes, all_shard_data) = self.merge_shard_traces(
-            &global_traces,
-            &global_chip_ordering,
-            &local_traces,
-            &local_chip_ordering,
-        );
-
-        let chips = self.machine().shard_chips_ordered(&all_chips_ordering).collect::<Vec<_>>();
-
-        assert!(chips.len() == all_shard_data.len());
+        let chips = self.machine().shard_chips_ordered(&data.chip_ordering).collect::<Vec<_>>();
+        let traces = data.traces;
 
         let config = self.machine().config();
 
-        let degrees =
-            all_shard_data.iter().map(|shard_data| shard_data.trace.height()).collect::<Vec<_>>();
+        let degrees = traces.iter().map(|trace| trace.height()).collect::<Vec<_>>();
 
         let log_degrees =
             degrees.iter().map(|degree| log2_strict_usize(*degree)).collect::<Vec<_>>();
@@ -428,8 +283,9 @@ where
         let trace_domains =
             degrees.iter().map(|degree| pcs.natural_domain_for_degree(*degree)).collect::<Vec<_>>();
 
-        // Observe the main commitment.
-        challenger.observe(local_main_commit.clone());
+        // Observe the public values and the main commitment.
+        challenger.observe_slice(&data.public_values[0..self.num_pv_elts()]);
+        challenger.observe(data.main_commit.clone());
 
         // Obtain the challenges used for the local permutation argument.
         let mut local_permutation_challenges: Vec<SC::Challenge> = Vec::new();
@@ -437,41 +293,46 @@ where
             local_permutation_challenges.push(challenger.sample_ext_element());
         }
 
-        let permutation_challenges = global_permutation_challenges
+        let packed_perm_challenges = local_permutation_challenges
             .iter()
-            .chain(local_permutation_challenges.iter())
-            .copied()
-            .collect::<Vec<_>>();
-
-        let packed_perm_challenges = permutation_challenges
-            .iter()
-            .chain(local_permutation_challenges.iter())
             .map(|c| PackedChallenge::<SC>::from_f(*c))
             .collect::<Vec<_>>();
 
         // Generate the permutation traces.
-        let ((permutation_traces, prep_traces), cumulative_sums): ((Vec<_>, Vec<_>), Vec<_>) =
-            tracing::debug_span!("generate permutation traces").in_scope(|| {
-                chips
-                    .par_iter()
-                    .zip(all_shard_data.par_iter())
-                    .map(|(chip, shard_data)| {
-                        let preprocessed_trace =
-                            pk.chip_ordering.get(&chip.name()).map(|&index| &pk.traces[index]);
-                        let (perm_trace, global_sum, local_sum) = chip.generate_permutation_trace(
-                            preprocessed_trace,
-                            shard_data.trace,
-                            &permutation_challenges,
-                        );
-                        ((perm_trace, preprocessed_trace), [global_sum, local_sum])
-                    })
-                    .unzip()
-            });
+        let ((permutation_traces, prep_traces), (global_cumulative_sums, local_cumulative_sums)): (
+            (Vec<_>, Vec<_>),
+            (Vec<_>, Vec<_>),
+        ) = tracing::debug_span!("generate permutation traces").in_scope(|| {
+            chips
+                .par_iter()
+                .zip(traces.par_iter())
+                .map(|(chip, main_trace)| {
+                    let preprocessed_trace =
+                        pk.chip_ordering.get(&chip.name()).map(|&index| &pk.traces[index]);
+                    let (perm_trace, local_sum) = chip.generate_permutation_trace(
+                        preprocessed_trace,
+                        main_trace,
+                        &local_permutation_challenges,
+                    );
+                    let global_sum = if chip.commit_scope() == InteractionScope::Local {
+                        SepticDigest::<Val<SC>>::zero()
+                    } else {
+                        let main_trace_size = main_trace.height() * main_trace.width();
+                        let last_row = &main_trace.values[main_trace_size - 14..main_trace_size];
+                        SepticDigest(SepticCurve {
+                            x: SepticExtension::<Val<SC>>::from_base_fn(|i| last_row[i]),
+                            y: SepticExtension::<Val<SC>>::from_base_fn(|i| last_row[i + 7]),
+                        })
+                    };
+                    ((perm_trace, preprocessed_trace), (global_sum, local_sum))
+                })
+                .unzip()
+        });
 
         // Compute some statistics.
         for i in 0..chips.len() {
-            let trace_width = all_shard_data[i].trace.width();
-            let trace_height = all_shard_data[i].trace.height();
+            let trace_width = traces[i].width();
+            let trace_height = traces[i].height();
             let prep_width = prep_traces[i].map_or(0, |x| x.width());
             let permutation_width = permutation_traces[i].width();
             let total_width = trace_width
@@ -508,13 +369,15 @@ where
 
         // Observe the permutation commitment and cumulative sums.
         challenger.observe(permutation_commit.clone());
-        for [global_sum, local_sum] in cumulative_sums.iter() {
-            challenger.observe_slice(global_sum.as_base_slice());
+        for (local_sum, global_sum) in
+            local_cumulative_sums.iter().zip(global_cumulative_sums.iter())
+        {
             challenger.observe_slice(local_sum.as_base_slice());
+            challenger.observe_slice(&global_sum.0.x.0);
+            challenger.observe_slice(&global_sum.0.y.0);
         }
 
         // Compute the quotient polynomial for all chips.
-
         let quotient_domains = trace_domains
             .iter()
             .zip_eq(log_degrees.iter())
@@ -537,25 +400,18 @@ where
                             let preprocessed_trace_on_quotient_domains =
                                 pk.chip_ordering.get(&chips[i].name()).map(|&index| {
                                     pcs.get_evaluations_on_domain(&pk.data, index, *quotient_domain)
+                                        .to_row_major_matrix()
                                 });
-                            let scope = all_chip_scopes[i];
-                            let main_data = if scope == InteractionScope::Global {
-                                global_main_data
-                                    .as_ref()
-                                    .expect("Expected global_main_data to be Some")
-                            } else {
-                                &local_main_data
-                            };
-                            let main_trace_on_quotient_domains = pcs.get_evaluations_on_domain(
-                                main_data,
-                                all_shard_data[i].main_data_idx,
-                                *quotient_domain,
-                            );
+                            let main_trace_on_quotient_domains = pcs
+                                .get_evaluations_on_domain(&data.main_data, i, *quotient_domain)
+                                .to_row_major_matrix();
                             let permutation_trace_on_quotient_domains = pcs
-                                .get_evaluations_on_domain(&permutation_data, i, *quotient_domain);
+                                .get_evaluations_on_domain(&permutation_data, i, *quotient_domain)
+                                .to_row_major_matrix();
                             quotient_values(
                                 chips[i],
-                                &cumulative_sums[i],
+                                &local_cumulative_sums[i],
+                                &global_cumulative_sums[i],
                                 trace_domains[i],
                                 *quotient_domain,
                                 preprocessed_trace_on_quotient_domains,
@@ -563,7 +419,7 @@ where
                                 permutation_trace_on_quotient_domains,
                                 &packed_perm_challenges,
                                 alpha,
-                                &local_public_values,
+                                &data.public_values,
                             )
                         })
                 })
@@ -640,61 +496,22 @@ where
         let quotient_opening_points =
             (0..num_quotient_chunks).map(|_| vec![zeta]).collect::<Vec<_>>();
 
-        // Split the trace_opening_points to the global and local chips.
-        let mut global_trace_opening_points = Vec::with_capacity(global_chip_ordering.len());
-        let mut local_trace_opening_points = Vec::with_capacity(local_chip_ordering.len());
-        for (i, trace_opening_point) in main_trace_opening_points.clone().into_iter().enumerate() {
-            let scope = all_chip_scopes[i];
-            if scope == InteractionScope::Global {
-                global_trace_opening_points.push(trace_opening_point);
-            } else {
-                local_trace_opening_points.push(trace_opening_point);
-            }
-        }
-
-        let rounds = if let Some(global_main_data) = global_main_data.as_ref() {
-            vec![
-                (&pk.data, preprocessed_opening_points),
-                (global_main_data, global_trace_opening_points),
-                (&local_main_data, local_trace_opening_points),
-                (&permutation_data, permutation_trace_opening_points),
-                (&quotient_data, quotient_opening_points),
-            ]
-        } else {
-            vec![
-                (&pk.data, preprocessed_opening_points),
-                (&local_main_data, local_trace_opening_points),
-                (&permutation_data, permutation_trace_opening_points),
-                (&quotient_data, quotient_opening_points),
-            ]
-        };
-
-        let (openings, opening_proof) =
-            tracing::debug_span!("open multi batches").in_scope(|| pcs.open(rounds, challenger));
-
-        // Collect the opened values for each chip.
-        let (
-            preprocessed_values,
-            global_main_values,
-            local_main_values,
-            permutation_values,
-            mut quotient_values,
-        ) = if global_main_data.is_some() {
-            let [preprocessed_values, global_main_values, local_main_values, permutation_values, quotient_values] =
-                openings.try_into().unwrap();
-            (
-                preprocessed_values,
-                Some(global_main_values),
-                local_main_values,
-                permutation_values,
-                quotient_values,
+        let (openings, opening_proof) = tracing::debug_span!("open multi batches").in_scope(|| {
+            pcs.open(
+                vec![
+                    (&pk.data, preprocessed_opening_points),
+                    (&data.main_data, main_trace_opening_points.clone()),
+                    (&permutation_data, permutation_trace_opening_points.clone()),
+                    (&quotient_data, quotient_opening_points),
+                ],
+                challenger,
             )
-        } else {
-            let [preprocessed_values, local_main_values, permutation_values, quotient_values] =
-                openings.try_into().unwrap();
-            (preprocessed_values, None, local_main_values, permutation_values, quotient_values)
-        };
+        });
 
+        // Collect the opened values for each chip.
+        let [preprocessed_values, main_values, permutation_values, mut quotient_values] =
+            openings.try_into().unwrap();
+        assert!(main_values.len() == chips.len());
         let preprocessed_opened_values = preprocessed_values
             .into_iter()
             .zip(pk.local_only.iter())
@@ -710,30 +527,11 @@ where
             })
             .collect::<Vec<_>>();
 
-        // Merge the global and local main values.
-        let mut main_values =
-            Vec::with_capacity(global_chip_ordering.len() + local_chip_ordering.len());
-        for chip in chips.iter() {
-            let global_order = global_chip_ordering.get(&chip.name());
-            let local_order = local_chip_ordering.get(&chip.name());
-            match (global_order, local_order) {
-                (Some(&global_order), None) => {
-                    let global_main_values =
-                        global_main_values.as_ref().expect("Global main values should be Some");
-                    main_values.push((global_main_values[global_order].clone(), chip.local_only()));
-                }
-                (None, Some(&local_order)) => {
-                    main_values.push((local_main_values[local_order].clone(), chip.local_only()));
-                }
-                _ => unreachable!(),
-            }
-        }
-        assert!(main_values.len() == chips.len());
-
         let main_opened_values = main_values
             .into_iter()
-            .map(|(op, local_only)| {
-                if !local_only {
+            .zip(chips.iter())
+            .map(|(op, chip)| {
+                if !chip.local_only() {
                     let [local, next] = op.try_into().unwrap();
                     AirOpenedValues { local, next }
                 } else {
@@ -750,7 +548,6 @@ where
                 AirOpenedValues { local, next }
             })
             .collect::<Vec<_>>();
-
         let mut quotient_opened_values = Vec::with_capacity(log_quotient_degrees.len());
         for log_quotient_degree in log_quotient_degrees.iter() {
             let degree = 1 << *log_quotient_degree;
@@ -762,38 +559,49 @@ where
             .into_iter()
             .zip_eq(permutation_opened_values)
             .zip_eq(quotient_opened_values)
-            .zip_eq(cumulative_sums)
+            .zip_eq(local_cumulative_sums)
+            .zip_eq(global_cumulative_sums)
             .zip_eq(log_degrees.iter())
             .enumerate()
-            .map(|(i, ((((main, permutation), quotient), cumulative_sums), log_degree))| {
-                let preprocessed = pk
-                    .chip_ordering
-                    .get(&chips[i].name())
-                    .map(|&index| preprocessed_opened_values[index].clone())
-                    .unwrap_or(AirOpenedValues { local: vec![], next: vec![] });
-                ChipOpenedValues {
-                    preprocessed,
-                    main,
-                    permutation,
-                    quotient,
-                    global_cumulative_sum: cumulative_sums[0],
-                    local_cumulative_sum: cumulative_sums[1],
-                    log_degree: *log_degree,
-                }
-            })
+            .map(
+                |(
+                    i,
+                    (
+                        (
+                            (((main, permutation), quotient), local_cumulative_sum),
+                            global_cumulative_sum,
+                        ),
+                        log_degree,
+                    ),
+                )| {
+                    let preprocessed = pk
+                        .chip_ordering
+                        .get(&chips[i].name())
+                        .map(|&index| preprocessed_opened_values[index].clone())
+                        .unwrap_or(AirOpenedValues { local: vec![], next: vec![] });
+                    ChipOpenedValues {
+                        preprocessed,
+                        main,
+                        permutation,
+                        quotient,
+                        global_cumulative_sum,
+                        local_cumulative_sum,
+                        log_degree: *log_degree,
+                    }
+                },
+            )
             .collect::<Vec<_>>();
 
         Ok(ShardProof::<SC> {
             commitment: ShardCommitment {
-                global_main_commit,
-                local_main_commit,
+                main_commit: data.main_commit.clone(),
                 permutation_commit,
                 quotient_commit,
             },
             opened_values: ShardOpenedValues { chips: opened_values },
             opening_proof,
-            chip_ordering: all_chips_ordering,
-            public_values: local_public_values,
+            chip_ordering: data.chip_ordering,
+            public_values: data.public_values,
         })
     }
 
@@ -812,69 +620,19 @@ where
     where
         A: for<'a> Air<DebugConstraintBuilder<'a, Val<SC>, SC::Challenge>>,
     {
+        // Generate dependencies.
+        self.machine().generate_dependencies(&mut records, &opts, None);
+
         // Observe the preprocessed commitment.
         pk.observe_into(challenger);
 
-        let contains_global_bus = self.machine().contains_global_bus();
-
-        if contains_global_bus {
-            // Generate dependencies.
-            self.machine().generate_dependencies(&mut records, &opts, None);
-        }
-
-        // Generate and commit the global traces for each shard.
-        let global_data = records
-            .par_iter()
-            .map(|record| {
-                if contains_global_bus {
-                    let global_named_traces =
-                        self.generate_traces(record, InteractionScope::Global);
-                    Some(self.commit(record, global_named_traces))
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-
-        // Observe the challenges for each segment.
-        tracing::debug_span!("observing all challenges").in_scope(|| {
-            global_data.iter().zip_eq(records.iter()).for_each(|(global_data, record)| {
-                if contains_global_bus {
-                    challenger.observe(
-                        global_data
-                            .as_ref()
-                            .expect("must have a global commitment")
-                            .main_commit
-                            .clone(),
-                    );
-                }
-                challenger.observe_slice(&record.public_values::<SC::Val>()[0..self.num_pv_elts()]);
-            });
-        });
-
-        // Obtain the challenges used for the global permutation argument.
-        let global_permutation_challenges: [SC::Challenge; 2] = array::from_fn(|_| {
-            if contains_global_bus {
-                challenger.sample_ext_element()
-            } else {
-                SC::Challenge::zero()
-            }
-        });
-
         let shard_proofs = tracing::info_span!("prove_shards").in_scope(|| {
-            global_data
+            records
                 .into_par_iter()
-                .zip_eq(records.par_iter())
-                .map(|(global_shard_data, record)| {
-                    let local_named_traces = self.generate_traces(record, InteractionScope::Local);
-                    let local_shard_data = self.commit(record, local_named_traces);
-                    self.open(
-                        pk,
-                        global_shard_data,
-                        local_shard_data,
-                        &mut challenger.clone(),
-                        &global_permutation_challenges,
-                    )
+                .map(|record| {
+                    let named_traces = self.generate_traces(&record);
+                    let shard_data = self.commit(&record, named_traces);
+                    self.open(pk, shard_data, &mut challenger.clone())
                 })
                 .collect::<Result<Vec<_>, _>>()
         })?;
@@ -897,13 +655,17 @@ where
         self.pc_start
     }
 
+    fn initial_global_cumulative_sum(&self) -> SepticDigest<Val<SC>> {
+        self.initial_global_cumulative_sum
+    }
+
     fn observe_into(&self, challenger: &mut Challenger<SC>) {
         challenger.observe(self.commit.clone());
         challenger.observe(self.pc_start);
+        challenger.observe_slice(&self.initial_global_cumulative_sum.0.x.0);
+        challenger.observe_slice(&self.initial_global_cumulative_sum.0.y.0);
         let zero = Val::<SC>::zero();
-        for _ in 0..7 {
-            challenger.observe(zero);
-        }
+        challenger.observe(zero);
     }
 }
 
diff --git a/crates/stark/src/quotient.rs b/crates/stark/src/quotient.rs
index ee98d5e7c2..8d014b77e7 100644
--- a/crates/stark/src/quotient.rs
+++ b/crates/stark/src/quotient.rs
@@ -5,7 +5,7 @@ use p3_matrix::{dense::RowMajorMatrixView, stack::VerticalPair, Matrix};
 use p3_maybe_rayon::prelude::*;
 use p3_util::log2_strict_usize;
 
-use crate::air::MachineAir;
+use crate::{air::MachineAir, septic_digest::SepticDigest};
 
 use super::{
     folder::ProverConstraintFolder, Chip, Domain, PackedChallenge, PackedVal, StarkGenericConfig,
@@ -18,7 +18,8 @@ use super::{
 #[allow(clippy::too_many_lines)]
 pub fn quotient_values<SC, A, Mat>(
     chip: &Chip<Val<SC>, A>,
-    cumulative_sums: &[SC::Challenge],
+    local_cumulative_sum: &SC::Challenge,
+    global_cumulative_sum: &SepticDigest<Val<SC>>,
     trace_domain: Domain<SC>,
     quotient_domain: Domain<SC>,
     preprocessed_trace_on_quotient_domain: Option<Mat>,
@@ -127,10 +128,7 @@ where
 
             let accumulator = PackedChallenge::<SC>::zero();
 
-            let packed_cumulative_sums = cumulative_sums
-                .iter()
-                .map(|c| PackedChallenge::<SC>::from_f(*c))
-                .collect::<Vec<_>>();
+            let packed_local_cumulative_sum = PackedChallenge::<SC>::from_f(*local_cumulative_sum);
 
             let mut folder = ProverConstraintFolder {
                 preprocessed: VerticalPair::new(
@@ -146,7 +144,8 @@ where
                     RowMajorMatrixView::new_row(&perm_next),
                 ),
                 perm_challenges,
-                cumulative_sums: &packed_cumulative_sums,
+                local_cumulative_sum: &packed_local_cumulative_sum,
+                global_cumulative_sum,
                 is_first_row,
                 is_last_row,
                 is_transition,
diff --git a/crates/stark/src/septic_curve.rs b/crates/stark/src/septic_curve.rs
new file mode 100644
index 0000000000..b1350dff81
--- /dev/null
+++ b/crates/stark/src/septic_curve.rs
@@ -0,0 +1,346 @@
+//! Elliptic Curve `y^2 = x^3 + 2x + 26z^5` over the `F_{p^7} = F_p[z]/(z^7 - 2z - 5)` extension field.
+use crate::septic_extension::SepticExtension;
+use p3_field::{AbstractExtensionField, AbstractField, Field, PrimeField};
+use serde::{Deserialize, Serialize};
+use std::ops::Add;
+
+/// An affine point on the septic elliptic curve y^2 = x^3 + 2x + 26z^5 over field `F_{p^7} = F_p[z]/(z^7 - 2z - 5)`; the point at infinity has no representation in this struct.
+#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Hash)]
+#[repr(C)]
+pub struct SepticCurve<F> {
+    /// The x-coordinate of an elliptic curve point, an element of the septic extension.
+    pub x: SepticExtension<F>,
+    /// The y-coordinate of an elliptic curve point, an element of the septic extension.
+    pub y: SepticExtension<F>,
+}
+
+/// Linear coefficient `a` of the pairwise independent hash `am + b` (see `SepticCurve::universal_hash`), derived from digits of pi.
+pub const A_EC_LOGUP: [u32; 7] =
+    [0x31415926, 0x53589793, 0x23846264, 0x33832795, 0x02884197, 0x16939937, 0x51058209];
+
+/// Constant coefficient `b` of the pairwise independent hash `am + b` (see `SepticCurve::universal_hash`), derived from digits of pi.
+pub const B_EC_LOGUP: [u32; 7] =
+    [0x74944592, 0x30781640, 0x62862089, 0x9862803, 0x48253421, 0x17067982, 0x14808651];
+
+/// The x-coordinate for a curve point used as a witness for padding interactions (returned by `SepticCurve::dummy`).
+pub const CURVE_WITNESS_DUMMY_POINT_X: [u32; 7] =
+    [0x2738281, 0x8284590, 0x4523536, 0x0287471, 0x3526624, 0x9775724, 0x7093699];
+
+/// The y-coordinate for a curve point used as a witness for padding interactions (returned by `SepticCurve::dummy`).
+pub const CURVE_WITNESS_DUMMY_POINT_Y: [u32; 7] =
+    [48041908, 550064556, 415267377, 1726976249, 1253299140, 209439863, 1302309485];
+
+impl<F: Field> SepticCurve<F> {
+    /// Returns the dummy point built from `CURVE_WITNESS_DUMMY_POINT_X` and `CURVE_WITNESS_DUMMY_POINT_Y`.
+    #[must_use]
+    pub fn dummy() -> Self {
+        Self {
+            x: SepticExtension::from_base_fn(|i| {
+                F::from_canonical_u32(CURVE_WITNESS_DUMMY_POINT_X[i])
+            }),
+            y: SepticExtension::from_base_fn(|i| {
+                F::from_canonical_u32(CURVE_WITNESS_DUMMY_POINT_Y[i])
+            }),
+        }
+    }
+
+    /// Returns `true` if the point satisfies the curve equation, i.e. `y^2 == curve_formula(x)`.
+    pub fn check_on_point(&self) -> bool {
+        self.y.square() == Self::curve_formula(self.x)
+    }
+
+    /// Negates a `SepticCurve` point by negating its y-coordinate; the x-coordinate is unchanged.
+    #[must_use]
+    pub fn neg(&self) -> Self {
+        SepticCurve { x: self.x, y: -self.y }
+    }
+
+    #[must_use]
+    /// Adds two elliptic curve points with the chord formula, assuming that the addition doesn't lead to the exception cases of Weierstrass addition (the slope divides by `other.x - self.x`).
+    pub fn add_incomplete(&self, other: SepticCurve<F>) -> Self {
+        let slope = (other.y - self.y) / (other.x - self.x);
+        let result_x = slope.square() - self.x - other.x;
+        let result_y = slope * (self.x - result_x) - self.y;
+        Self { x: result_x, y: result_y }
+    }
+
+    /// Adds `other` to this point in place via `add_incomplete`, assuming that the addition doesn't lead to the exception cases of Weierstrass addition.
+    pub fn add_assign(&mut self, other: SepticCurve<F>) {
+        let result = self.add_incomplete(other);
+        self.x = result.x;
+        self.y = result.y;
+    }
+
+    #[must_use]
+    /// Doubles the elliptic curve point with the tangent slope `(3x^2 + 2) / (2y)`; the slope divides by `2y`.
+    pub fn double(&self) -> Self {
+        let slope = (self.x * self.x * F::from_canonical_u8(3u8) + F::two()) / (self.y * F::two());
+        let result_x = slope.square() - self.x * F::two();
+        let result_y = slope * (self.x - result_x) - self.y;
+        Self { x: result_x, y: result_y }
+    }
+
+    /// Subtracts `other` by adding its negation with `add_incomplete`, assuming that the subtraction doesn't lead to the exception cases of Weierstrass addition.
+    #[must_use]
+    pub fn sub_incomplete(&self, other: SepticCurve<F>) -> Self {
+        self.add_incomplete(other.neg())
+    }
+
+    /// Subtracts `other` from this point in place (adds `other.neg()`), assuming that the subtraction doesn't lead to the exception cases of Weierstrass addition.
+    pub fn sub_assign(&mut self, other: SepticCurve<F>) {
+        let result = self.add_incomplete(other.neg());
+        self.x = result.x;
+        self.y = result.y;
+    }
+}
+
+impl<F: AbstractField> SepticCurve<F> {
+    /// Convert a message into an x-coordinate by a pairwise independent hash `am + b`, with `a = A_EC_LOGUP` and `b = B_EC_LOGUP`.
+    pub fn universal_hash(m: SepticExtension<F>) -> SepticExtension<F> {
+        let a_ec_logup =
+            SepticExtension::<F>::from_base_fn(|i| F::from_canonical_u32(A_EC_LOGUP[i]));
+        let b_ec_logup =
+            SepticExtension::<F>::from_base_fn(|i| F::from_canonical_u32(B_EC_LOGUP[i]));
+        a_ec_logup * m + b_ec_logup
+    }
+
+    /// Evaluates the right-hand side of the curve equation, `x^3 + 2x + 26z^5`.
+    pub fn curve_formula(x: SepticExtension<F>) -> SepticExtension<F> {
+        x.cube()
+            + x * F::two()
+            + SepticExtension::from_base_slice(&[
+                F::zero(),
+                F::zero(),
+                F::zero(),
+                F::zero(),
+                F::zero(),
+                F::from_canonical_u32(26), // entry 5 of 7: the `26z^5` term of the formula
+                F::zero(),
+            ])
+    }
+}
+
+impl<F: PrimeField> SepticCurve<F> {
+    /// Lifts a message `m` to a curve point whose x-coordinate is `universal_hash` of the (offset) message.
+    /// As the hashed x-coordinate may not be a valid one, we allow additions of [0, 256) * 2^16 to the first entry of `m` before hashing, and retry with the next offset.
+    /// Also, we always return the curve point with y-coordinate within [0, (p-1)/2), where p is the characteristic: roots with `is_exception()` are skipped and roots with `is_send()` are negated (NOTE(review): both predicates are defined outside this file; confirm the exact range).
+    /// The returned values are the curve point and the offset used. Panics if none of the 256 offsets yields a usable point.
+    pub fn lift_x(m: SepticExtension<F>) -> (Self, u8) {
+        for offset in 0..=255 {
+            let m_trial =
+                m + SepticExtension::from_base(F::from_canonical_u32((offset as u32) << 16));
+            let x_trial = Self::universal_hash(m_trial);
+            let y_sq = Self::curve_formula(x_trial);
+            if let Some(y) = y_sq.sqrt() {
+                if y.is_exception() {
+                    continue; // this root is rejected; try the next offset
+                }
+                if y.is_send() {
+                    return (Self { x: x_trial, y: -y }, offset); // normalize by returning the other root
+                }
+                return (Self { x: x_trial, y }, offset);
+            }
+        }
+        panic!("curve point couldn't be found after 256 attempts");
+    }
+}
+
+impl<F: AbstractField> SepticCurve<F> {
+    /// Given three points p1, p2, p3, the function is zero if and only if p3.x == (p1 + p2).x, assuming that p1.x != p2.x (the chord slope is cleared of its denominator `p2.x - p1.x`).
+    pub fn sum_checker_x(
+        p1: SepticCurve<F>,
+        p2: SepticCurve<F>,
+        p3: SepticCurve<F>,
+    ) -> SepticExtension<F> {
+        (p1.x.clone() + p2.x.clone() + p3.x) * (p2.x.clone() - p1.x.clone()).square()
+            - (p2.y - p1.y).square()
+    }
+
+    /// Given three points p1, p2, p3, the function is zero if and only if p3.y == (p1 + p2).y, assuming that p1.x != p2.x and that p3.x == (p1 + p2).x already holds.
+    pub fn sum_checker_y(
+        p1: SepticCurve<F>,
+        p2: SepticCurve<F>,
+        p3: SepticCurve<F>,
+    ) -> SepticExtension<F> {
+        (p1.y.clone() + p3.y.clone()) * (p2.x.clone() - p1.x.clone())
+            - (p2.y - p1.y.clone()) * (p1.x - p3.x)
+    }
+}
+
+impl<T> SepticCurve<T> {
+    /// Convert a `SepticCurve<S>` into `SepticCurve<T>` by applying `f` to every entry of the x-coordinate and then every entry of the y-coordinate.
+    pub fn convert<S: Copy, G: FnMut(S) -> T>(point: SepticCurve<S>, mut f: G) -> Self {
+        SepticCurve {
+            x: SepticExtension(point.x.0.map(&mut f)),
+            y: SepticExtension(point.y.0.map(&mut f)),
+        }
+    }
+}
+
+/// A septic elliptic curve point on y^2 = x^3 + 2x + 26z^5 over field `F_{p^7} = F_p[z]/(z^7 - 2z - 5)`, including the point at infinity.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub enum SepticCurveComplete<T> {
+    /// The point at infinity, which acts as the identity in the `Add` implementation.
+    Infinity,
+    /// An affine point, represented with a `SepticCurve<T>` structure.
+    Affine(SepticCurve<T>),
+}
+
+impl<F: Field> Add for SepticCurveComplete<F> {
+    type Output = Self;
+    fn add(self, rhs: Self) -> Self::Output {
+        if self.is_infinity() {
+            return rhs; // infinity + rhs = rhs
+        }
+        if rhs.is_infinity() {
+            return self; // self + infinity = self
+        }
+        let point1 = self.point();
+        let point2 = rhs.point();
+        if point1.x != point2.x {
+            return Self::Affine(point1.add_incomplete(point2)); // distinct x: chord addition is well defined
+        }
+        if point1.y == point2.y {
+            return Self::Affine(point1.double()); // same point: use the tangent formula
+        }
+        Self::Infinity // same x but different y: the sum is the point at infinity
+    }
+}
+
+impl<F: Field> SepticCurveComplete<F> {
+    /// Returns `true` for the `Infinity` variant and `false` for any `Affine` point.
+    pub fn is_infinity(&self) -> bool {
+        match self {
+            Self::Infinity => true,
+            Self::Affine(_) => false,
+        }
+    }
+
+    /// Returns a copy of the inner `SepticCurve` value; panics if the point is the point at infinity.
+    pub fn point(&self) -> SepticCurve<F> {
+        match self {
+            Self::Infinity => panic!("point() called for point at infinity"),
+            Self::Affine(point) => *point,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_baby_bear::BabyBear;
+    use p3_maybe_rayon::prelude::ParallelIterator;
+    use p3_maybe_rayon::prelude::{IndexedParallelIterator, IntoParallelIterator};
+    use rayon_scan::ScanParallelIterator;
+    use std::time::Instant;
+
+    use super::*;
+
+    #[test]
+    fn test_lift_x() {
+        let x: SepticExtension<BabyBear> = SepticExtension::from_base_slice(&[
+            BabyBear::from_canonical_u32(0x2013),
+            BabyBear::from_canonical_u32(0x2015),
+            BabyBear::from_canonical_u32(0x2016),
+            BabyBear::from_canonical_u32(0x2023),
+            BabyBear::from_canonical_u32(0x2024),
+            BabyBear::from_canonical_u32(0x2016),
+            BabyBear::from_canonical_u32(0x2017),
+        ]);
+        let (curve_point, _) = SepticCurve::<BabyBear>::lift_x(x);
+        assert!(curve_point.check_on_point());
+        assert!(curve_point.x.is_receive()); // NOTE(review): lift_x normalizes `y` via `is_send()`; confirm that `x` is the intended operand here
+    }
+
+    #[test]
+    fn test_double() {
+        let x: SepticExtension<BabyBear> = SepticExtension::from_base_slice(&[
+            BabyBear::from_canonical_u32(0x2013),
+            BabyBear::from_canonical_u32(0x2015),
+            BabyBear::from_canonical_u32(0x2016),
+            BabyBear::from_canonical_u32(0x2023),
+            BabyBear::from_canonical_u32(0x2024),
+            BabyBear::from_canonical_u32(0x2016),
+            BabyBear::from_canonical_u32(0x2017),
+        ]);
+        let (curve_point, _) = SepticCurve::<BabyBear>::lift_x(x);
+        let double_point = curve_point.double();
+        assert!(double_point.check_on_point()); // the doubled point must still satisfy the curve equation
+    }
+
+    #[test]
+    #[ignore] // timing-oriented test; skipped unless ignored tests are requested explicitly
+    fn test_simple_bench() {
+        const D: u32 = 1 << 16;
+        let mut vec = Vec::with_capacity(D as usize);
+        let mut sum = Vec::with_capacity(D as usize);
+        let start = Instant::now();
+        for i in 0..D {
+            let x: SepticExtension<BabyBear> = SepticExtension::from_base_slice(&[
+                BabyBear::from_canonical_u32(i + 25),
+                BabyBear::from_canonical_u32(2 * i + 376),
+                BabyBear::from_canonical_u32(4 * i + 23),
+                BabyBear::from_canonical_u32(8 * i + 531),
+                BabyBear::from_canonical_u32(16 * i + 542),
+                BabyBear::from_canonical_u32(32 * i + 196),
+                BabyBear::from_canonical_u32(64 * i + 667),
+            ]);
+            let (curve_point, _) = SepticCurve::<BabyBear>::lift_x(x);
+            vec.push(curve_point);
+        }
+        println!("Time elapsed: {:?}", start.elapsed()); // time to lift D messages
+        let start = Instant::now();
+        for i in 0..D {
+            sum.push(vec[i as usize].add_incomplete(vec[((i + 1) % D) as usize]));
+        }
+        println!("Time elapsed: {:?}", start.elapsed()); // time for D incomplete additions of cyclic neighbours
+        let start = Instant::now();
+        for i in 0..(D as usize) { // below, `(i + 1) % D as usize` parses as `(i + 1) % (D as usize)`
+            assert!(
+                SepticCurve::<BabyBear>::sum_checker_x(vec[i], vec[(i + 1) % D as usize], sum[i])
+                    == SepticExtension::<BabyBear>::zero()
+            );
+            assert!(
+                SepticCurve::<BabyBear>::sum_checker_y(vec[i], vec[(i + 1) % D as usize], sum[i])
+                    == SepticExtension::<BabyBear>::zero()
+            );
+        }
+        println!("Time elapsed: {:?}", start.elapsed()); // time to evaluate both sum checkers on every triple
+    }
+
+    #[test]
+    #[ignore] // timing-oriented test; skipped unless ignored tests are requested explicitly
+    fn test_parallel_bench() {
+        const D: u32 = 1 << 20;
+        let mut vec = Vec::with_capacity(D as usize);
+        let start = Instant::now();
+        for i in 0..D {
+            let x: SepticExtension<BabyBear> = SepticExtension::from_base_slice(&[
+                BabyBear::from_canonical_u32(i + 25),
+                BabyBear::from_canonical_u32(2 * i + 376),
+                BabyBear::from_canonical_u32(4 * i + 23),
+                BabyBear::from_canonical_u32(8 * i + 531),
+                BabyBear::from_canonical_u32(16 * i + 542),
+                BabyBear::from_canonical_u32(32 * i + 196),
+                BabyBear::from_canonical_u32(64 * i + 667),
+            ]);
+            let (curve_point, _) = SepticCurve::<BabyBear>::lift_x(x);
+            vec.push(SepticCurveComplete::Affine(curve_point));
+        }
+        println!("Time elapsed: {:?}", start.elapsed()); // time to lift D messages
+
+        let mut cum_sum = SepticCurveComplete::Infinity;
+        let start = Instant::now();
+        for point in &vec {
+            cum_sum = cum_sum + *point;
+        }
+        println!("Time elapsed: {:?}", start.elapsed()); // time for the sequential running sum
+        let start = Instant::now();
+        let par_sum = vec
+            .into_par_iter()
+            .with_min_len(1 << 16)
+            .scan(|a, b| *a + *b, SepticCurveComplete::Infinity) // presumably a parallel prefix sum from `rayon_scan` — confirm
+            .collect::<Vec<SepticCurveComplete<BabyBear>>>();
+        println!("Time elapsed: {:?}", start.elapsed()); // time for the parallel scan
+        assert_eq!(cum_sum, *par_sum.last().unwrap()); // last scan entry must equal the sequential total
+    }
+}
diff --git a/crates/stark/src/septic_digest.rs b/crates/stark/src/septic_digest.rs
new file mode 100644
index 0000000000..579c5cb9bb
--- /dev/null
+++ b/crates/stark/src/septic_digest.rs
@@ -0,0 +1,98 @@
+//! Elliptic curve digests with a starting point to avoid Weierstrass addition exceptions.
+use crate::septic_curve::SepticCurve;
+use crate::septic_extension::SepticExtension;
+use p3_field::{AbstractExtensionField, AbstractField, Field};
+use serde::{Deserialize, Serialize};
+use std::iter::Sum;
+
+/// The x-coordinate (seven base-field limbs) of the curve point used as the starting cumulative sum for global permutation trace generation; read by `SepticDigest::zero`.
+pub const CURVE_CUMULATIVE_SUM_START_X: [u32; 7] =
+    [0x1434213, 0x5623730, 0x9504880, 0x1688724, 0x2096980, 0x7856967, 0x1875376];
+
+/// The y-coordinate (seven base-field limbs) of the curve point used as the starting cumulative sum for global permutation trace generation; read by `SepticDigest::zero`.
+pub const CURVE_CUMULATIVE_SUM_START_Y: [u32; 7] =
+    [885797405, 1130275556, 567836311, 52700240, 239639200, 442612155, 1839439733];
+
+/// The x-coordinate (seven base-field limbs) of the curve point used as the starting random point for digest accumulation; read by `SepticDigest::starting_digest`.
+pub const DIGEST_SUM_START_X: [u32; 7] =
+    [0x1742050, 0x8075688, 0x7729352, 0x7446341, 0x5058723, 0x6694280, 0x5253810];
+
+/// The y-coordinate (seven base-field limbs) of the curve point used as the starting random point for digest accumulation; read by `SepticDigest::starting_digest`.
+pub const DIGEST_SUM_START_Y: [u32; 7] =
+    [462194069, 1842131493, 281651264, 1684885851, 483907222, 1097389352, 1648978901];
+
+/// A global cumulative sum digest, a point on the elliptic curve that `SepticCurve<F>` represents.
+/// As these digests start with the point (`CURVE_CUMULATIVE_SUM_START_X`, `CURVE_CUMULATIVE_SUM_START_Y`), they require special summing logic (see the `Sum` impl).
+#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Hash)]
+#[repr(C)]
+pub struct SepticDigest<F>(pub SepticCurve<F>);
+
+impl<F: AbstractField> SepticDigest<F> {
+    /// The zero digest, the starting point of the accumulation of curve points derived from the
+    /// scheme. Its coordinates are `CURVE_CUMULATIVE_SUM_START_X` / `CURVE_CUMULATIVE_SUM_START_Y`.
+    #[must_use]
+    pub fn zero() -> Self {
+        let x = SepticExtension::<F>::from_base_fn(|limb| {
+            F::from_canonical_u32(CURVE_CUMULATIVE_SUM_START_X[limb])
+        });
+        let y = SepticExtension::<F>::from_base_fn(|limb| {
+            F::from_canonical_u32(CURVE_CUMULATIVE_SUM_START_Y[limb])
+        });
+        SepticDigest(SepticCurve { x, y })
+    }
+
+    /// The digest used for starting the accumulation of digests. Its coordinates are
+    /// `DIGEST_SUM_START_X` / `DIGEST_SUM_START_Y`.
+    #[must_use]
+    pub fn starting_digest() -> Self {
+        let x = SepticExtension::<F>::from_base_fn(|limb| {
+            F::from_canonical_u32(DIGEST_SUM_START_X[limb])
+        });
+        let y = SepticExtension::<F>::from_base_fn(|limb| {
+            F::from_canonical_u32(DIGEST_SUM_START_Y[limb])
+        });
+        SepticDigest(SepticCurve { x, y })
+    }
+}
+
+impl<F: Field> SepticDigest<F> {
+    /// Returns `true` when this digest equals `SepticDigest::zero()`, the starting point of the
+    /// accumulation (not the all-zero coordinate pair).
+    pub fn is_zero(&self) -> bool {
+        let zero_digest = SepticDigest::<F>::zero();
+        zero_digest == *self
+    }
+}
+
+impl<F: Field> Sum for SepticDigest<F> {
+    /// Adds up digests that are each shifted by the `zero()` point.
+    ///
+    /// The running total starts from `starting_digest()`; every step adds one digest and removes
+    /// one copy of the `zero()` offset. At the end a single offset is added back and the starting
+    /// point is removed, so an empty iterator yields `zero()`.
+    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+        let start = SepticDigest::<F>::starting_digest().0;
+        let offset = SepticDigest::<F>::zero().0;
+
+        // Computation order is start + (digest1 - offset) + (digest2 - offset) + ... + (digestN - offset) + offset - start.
+        let mut total = start;
+        for digest in iter {
+            total = total.add_incomplete(digest.0).sub_incomplete(offset);
+        }
+
+        total.add_assign(offset);
+        total.sub_assign(start);
+        SepticDigest(total)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use p3_baby_bear::BabyBear;
+
+    /// Both hard-coded starting points must pass `SepticCurve::check_on_point`.
+    #[test]
+    fn test_const_points() {
+        let start_points = [
+            (CURVE_CUMULATIVE_SUM_START_X, CURVE_CUMULATIVE_SUM_START_Y),
+            (DIGEST_SUM_START_X, DIGEST_SUM_START_Y),
+        ];
+        for (x_limbs, y_limbs) in start_points {
+            let x: SepticExtension<BabyBear> =
+                SepticExtension::from_base_fn(|i| BabyBear::from_canonical_u32(x_limbs[i]));
+            let y: SepticExtension<BabyBear> =
+                SepticExtension::from_base_fn(|i| BabyBear::from_canonical_u32(y_limbs[i]));
+            let point = SepticCurve { x, y };
+            assert!(point.check_on_point());
+        }
+    }
+}
diff --git a/crates/stark/src/septic_extension.rs b/crates/stark/src/septic_extension.rs
new file mode 100644
index 0000000000..651e3c7fa0
--- /dev/null
+++ b/crates/stark/src/septic_extension.rs
@@ -0,0 +1,883 @@
+//! A septic extension with an irreducible polynomial `z^7 - 2z - 5`.
+use num_bigint::BigUint;
+use num_traits::One;
+use p3_field::PrimeField;
+use p3_field::{AbstractExtensionField, AbstractField, ExtensionField, Field, Packable};
+use serde::{Deserialize, Serialize};
+use std::array;
+use std::fmt::Display;
+use std::iter::{Product, Sum};
+use std::ops::{Add, AddAssign, Div, Index, IndexMut, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use crate::air::{SP1AirBuilder, SepticExtensionAirBuilder};
+
+/// A septic extension with an irreducible polynomial `z^7 - 2z - 5`.
+///
+/// The field can be constructed as `F_{p^7} = F_p[z]/(z^7 - 2z - 5)`. The array stores the coefficients in ascending order: `self.0[i]` is the coefficient of `z^i`.
+#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Hash)]
+#[repr(C)]
+pub struct SepticExtension<F>(pub [F; 7]);
+
+// Every constructor except `from_f` and `generator` embeds a base-field value as the constant
+// coefficient (all higher coefficients zero), which is exactly what `from_base` does.
+impl<F: AbstractField> AbstractField for SepticExtension<F> {
+    type F = SepticExtension<F::F>;
+
+    fn zero() -> Self {
+        Self::from_base(F::zero())
+    }
+
+    fn one() -> Self {
+        Self::from_base(F::one())
+    }
+
+    fn two() -> Self {
+        Self::from_base(F::two())
+    }
+
+    fn neg_one() -> Self {
+        Self::from_base(F::neg_one())
+    }
+
+    /// Converts coefficient by coefficient from the underlying field representation.
+    fn from_f(f: Self::F) -> Self {
+        SepticExtension(f.0.map(F::from_f))
+    }
+
+    fn from_bool(b: bool) -> Self {
+        Self::from_base(F::from_bool(b))
+    }
+
+    fn from_canonical_u8(n: u8) -> Self {
+        Self::from_base(F::from_canonical_u8(n))
+    }
+
+    fn from_canonical_u16(n: u16) -> Self {
+        Self::from_base(F::from_canonical_u16(n))
+    }
+
+    fn from_canonical_u32(n: u32) -> Self {
+        Self::from_base(F::from_canonical_u32(n))
+    }
+
+    fn from_canonical_u64(n: u64) -> Self {
+        Self::from_base(F::from_canonical_u64(n))
+    }
+
+    fn from_canonical_usize(n: usize) -> Self {
+        Self::from_base(F::from_canonical_usize(n))
+    }
+
+    fn from_wrapped_u32(n: u32) -> Self {
+        Self::from_base(F::from_wrapped_u32(n))
+    }
+
+    fn from_wrapped_u64(n: u64) -> Self {
+        Self::from_base(F::from_wrapped_u64(n))
+    }
+
+    /// Returns the element `2 + z`.
+    fn generator() -> Self {
+        let zero = F::zero;
+        SepticExtension([F::two(), F::one(), zero(), zero(), zero(), zero(), zero()])
+    }
+}
+
+impl<F: Field> Field for SepticExtension<F> {
+    type Packing = Self;
+
+    /// Returns `None` for the zero element and `Some(self.inv())` otherwise.
+    fn try_inverse(&self) -> Option<Self> {
+        if self.is_zero() {
+            None
+        } else {
+            Some(self.inv())
+        }
+    }
+
+    /// The number of elements of the extension: the base field order raised to the 7th power.
+    fn order() -> BigUint {
+        let base_order = F::order();
+        base_order.pow(7)
+    }
+}
+
+impl<F: AbstractField> AbstractExtensionField<F> for SepticExtension<F> {
+    const D: usize = 7;
+
+    /// Embeds `b` as the constant coefficient; all higher coefficients are zero.
+    fn from_base(b: F) -> Self {
+        let zero = F::zero;
+        Self([b, zero(), zero(), zero(), zero(), zero(), zero()])
+    }
+
+    /// Clones the first seven entries of `bs` into a new element.
+    ///
+    /// # Panics
+    /// Panics (index out of bounds) when `bs` holds fewer than seven entries; extra entries are
+    /// ignored.
+    fn from_base_slice(bs: &[F]) -> Self {
+        Self(array::from_fn(|i| bs[i].clone()))
+    }
+
+    /// Builds the element whose `i`-th coefficient is `f(i)`, calling `f` for `i = 0..7` in order.
+    fn from_base_fn<G: FnMut(usize) -> F>(f: G) -> Self {
+        let coefficients: [F; 7] = array::from_fn(f);
+        Self(coefficients)
+    }
+
+    /// Borrows the seven coefficients as a slice.
+    fn as_base_slice(&self) -> &[F] {
+        &self.0
+    }
+}
+
+impl<F: Field> ExtensionField<F> for SepticExtension<F> { // degree-7 extension of the base field `F`
+    type ExtensionPacking = SepticExtension<F::Packing>; // same seven-coefficient layout over `F::Packing`
+}
+
+impl<F: Field> Packable for SepticExtension<F> {} // empty impl: opts the type into `Packable` without overriding anything
+
+impl<F: AbstractField> Add for SepticExtension<F> {
+    type Output = Self;
+
+    /// Adds two elements coefficient by coefficient.
+    fn add(self, rhs: Self) -> Self::Output {
+        let mut limbs = self.0;
+        limbs.iter_mut().zip(rhs.0).for_each(|(lhs_limb, rhs_limb)| {
+            *lhs_limb = lhs_limb.clone() + rhs_limb;
+        });
+        Self(limbs)
+    }
+}
+
+impl<F: AbstractField> AddAssign for SepticExtension<F> {
+    /// Adds `rhs` to `self` in place, coefficient by coefficient.
+    fn add_assign(&mut self, rhs: Self) {
+        for (lhs_limb, rhs_limb) in self.0.iter_mut().zip(rhs.0) {
+            *lhs_limb += rhs_limb;
+        }
+    }
+}
+
+impl<F: AbstractField> Sub for SepticExtension<F> {
+    type Output = Self;
+
+    /// Subtracts `rhs` from `self` coefficient by coefficient.
+    fn sub(self, rhs: Self) -> Self::Output {
+        let mut limbs = self.0;
+        limbs.iter_mut().zip(rhs.0).for_each(|(lhs_limb, rhs_limb)| {
+            *lhs_limb = lhs_limb.clone() - rhs_limb;
+        });
+        Self(limbs)
+    }
+}
+
+impl<F: AbstractField> SubAssign for SepticExtension<F> {
+    /// Subtracts `rhs` from `self` in place, coefficient by coefficient.
+    ///
+    /// All seven coefficients are updated so that `a -= b` agrees with `a = a - b`. Updating only
+    /// the constant coefficient would leave the `z^1..z^6` coefficients of `rhs` unsubtracted.
+    fn sub_assign(&mut self, rhs: Self) {
+        for (lhs_limb, rhs_limb) in self.0.iter_mut().zip(rhs.0) {
+            *lhs_limb -= rhs_limb;
+        }
+    }
+}
+
+impl<F: AbstractField> Neg for SepticExtension<F> {
+    type Output = Self;
+
+    /// Negates every coefficient.
+    fn neg(self) -> Self::Output {
+        Self(self.0.map(|limb| -limb))
+    }
+}
+
+impl<F: AbstractField> Mul for SepticExtension<F> {
+    type Output = Self;
+
+    /// Multiplies two elements: a schoolbook polynomial product (13 coefficients) followed by a
+    /// reduction that folds the coefficients of `z^7..z^12` back using `z^7 = 2z + 5`.
+    fn mul(self, rhs: Self) -> Self::Output {
+        // Unreduced product: wide[k] collects every lhs[i] * rhs[j] with i + j == k.
+        let mut wide: [F; 13] = core::array::from_fn(|_| F::zero());
+        for (i, lhs_limb) in self.0.iter().enumerate() {
+            for (j, rhs_limb) in rhs.0.iter().enumerate() {
+                wide[i + j] = wide[i + j].clone() + lhs_limb.clone() * rhs_limb.clone();
+            }
+        }
+
+        let mut out: [F; 7] = core::array::from_fn(|k| wide[k].clone());
+        // z^(7 + k) = z^k * (2z + 5), so wide[7 + k] contributes 5x to out[k] and 2x to out[k + 1].
+        for (k, high) in wide[7..].iter().enumerate() {
+            out[k] = out[k].clone() + high.clone() * F::from_canonical_u32(5);
+            out[k + 1] = out[k + 1].clone() + high.clone() * F::from_canonical_u32(2);
+        }
+        Self(out)
+    }
+}
+
+impl<F: AbstractField> MulAssign for SepticExtension<F> {
+    /// Replaces `self` with the extension-field product `self * rhs`.
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.clone() * rhs;
+    }
+}
+
+impl<F: AbstractField> Product for SepticExtension<F> {
+    /// Multiplies all items together, starting from one (so an empty iterator yields one).
+    fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
+        let mut acc = Self::one();
+        for factor in iter {
+            acc = acc * factor;
+        }
+        acc
+    }
+}
+
+impl<F: AbstractField> Sum for SepticExtension<F> {
+    /// Adds all items together, starting from zero (so an empty iterator yields zero).
+    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+        let mut acc = Self::zero();
+        for term in iter {
+            acc = acc + term;
+        }
+        acc
+    }
+}
+
+impl<F: AbstractField> From<F> for SepticExtension<F> {
+    /// Embeds a base-field value as the constant coefficient; higher coefficients are zero.
+    fn from(f: F) -> Self {
+        let zero = F::zero;
+        Self([f, zero(), zero(), zero(), zero(), zero(), zero()])
+    }
+}
+
+impl<F: AbstractField> Add<F> for SepticExtension<F> {
+    type Output = Self;
+
+    /// Adds a base-field value: only the constant coefficient changes.
+    fn add(self, rhs: F) -> Self::Output {
+        let [c0, c1, c2, c3, c4, c5, c6] = self.0;
+        SepticExtension([c0 + rhs, c1, c2, c3, c4, c5, c6])
+    }
+}
+
+impl<F: AbstractField> AddAssign<F> for SepticExtension<F> {
+    /// Adds a base-field value in place: only the constant coefficient changes.
+    fn add_assign(&mut self, rhs: F) {
+        let constant = &mut self.0[0];
+        *constant += rhs;
+    }
+}
+
+impl<F: AbstractField> Sub<F> for SepticExtension<F> {
+    type Output = Self;
+
+    /// Subtracts a base-field value by adding its negation to the constant coefficient.
+    fn sub(self, rhs: F) -> Self::Output {
+        let negated = -rhs;
+        self + negated
+    }
+}
+
+impl<F: AbstractField> SubAssign<F> for SepticExtension<F> {
+    /// Subtracts a base-field value in place: only the constant coefficient changes.
+    fn sub_assign(&mut self, rhs: F) {
+        let constant = &mut self.0[0];
+        *constant -= rhs;
+    }
+}
+
+impl<F: AbstractField> Mul<F> for SepticExtension<F> {
+    type Output = Self;
+
+    /// Scales every coefficient by the base-field value `rhs`.
+    fn mul(self, rhs: F) -> Self::Output {
+        SepticExtension(self.0.map(|limb| limb * rhs.clone()))
+    }
+}
+
+impl<F: AbstractField> MulAssign<F> for SepticExtension<F> {
+    /// Scales every coefficient by the base-field value `rhs`, in place.
+    fn mul_assign(&mut self, rhs: F) {
+        for limb in &mut self.0 {
+            *limb *= rhs.clone();
+        }
+    }
+}
+
+impl<F: Field> Div for SepticExtension<F> {
+    type Output = Self;
+
+    /// Divides by multiplying with `rhs.inverse()`.
+    ///
+    /// NOTE(review): `inverse()` presumably panics for a zero `rhs` — confirm against `p3_field`.
+    // Division is implemented through multiplication, which the lint would otherwise flag.
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn div(self, rhs: Self) -> Self::Output {
+        let rhs_inverse = rhs.inverse();
+        self * rhs_inverse
+    }
+}
+
+impl<F: AbstractField> Display for SepticExtension<F> {
+    /// Prints the coefficient array using its `Debug` representation.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_fmt(format_args!("{:?}", self.0))
+    }
+}
+
+impl<F: Field> SepticExtension<F> {
+    /// Looks up a hard-coded table entry for `index` in `0..=6`; `index == 0` yields one.
+    ///
+    /// `frobenius` uses entry `i` as the image of `z^i`, so the entries are presumably
+    /// `z^(i * p)` for the BabyBear prime `p` (the constants cannot be re-derived from this file).
+    ///
+    /// # Panics
+    /// Panics via `unreachable!` when `index > 6`; debug builds also assert that `F::order()` is
+    /// `2013265921`.
+    fn z_pow_p(index: u32) -> Self {
+        // The constants written below are specifically for the BabyBear field.
+        debug_assert_eq!(F::order(), BigUint::from(2013265921u32));
+        let limbs: [u32; 7] = match index {
+            0 => return Self::one(),
+            1 => [
+                954599710, 1359279693, 566669999, 1982781815, 1735718361, 1174868538, 1120871770,
+            ],
+            2 => [
+                862825265, 597046311, 978840770, 1790138282, 1044777201, 835869808, 1342179023,
+            ],
+            3 => [
+                596273169, 658837454, 1515468261, 367059247, 781278880, 1544222616, 155490465,
+            ],
+            4 => [
+                557608863, 1173670028, 1749546888, 1086464137, 803900099, 1288818584, 1184677604,
+            ],
+            5 => [
+                763416381, 1252567168, 628856225, 1771903394, 650712211, 19417363, 57990258,
+            ],
+            6 => [
+                1734711039, 1749813853, 1227235221, 1707730636, 424560395, 1007029514, 498034669,
+            ],
+            _ => unreachable!(),
+        };
+        SepticExtension(limbs.map(F::from_canonical_u32))
+    }
+
+    /// Looks up a hard-coded table entry for `index` in `0..=6`; `index == 0` yields one.
+    ///
+    /// `double_frobenius` uses entry `i` as the image of `z^i`, so the entries are presumably
+    /// `z^(i * p^2)` for the BabyBear prime `p`.
+    ///
+    /// # Panics
+    /// Panics via `unreachable!` when `index > 6`; debug builds also assert that `F::order()` is
+    /// `2013265921`.
+    fn z_pow_p2(index: u32) -> Self {
+        // The constants written below are specifically for the BabyBear field.
+        debug_assert_eq!(F::order(), BigUint::from(2013265921u32));
+        let limbs: [u32; 7] = match index {
+            0 => return Self::one(),
+            1 => [
+                1013489358, 1619071628, 304593143, 1949397349, 1564307636, 327761151, 415430835,
+            ],
+            2 => [
+                209824426, 1313900768, 38410482, 256593180, 1708830551, 1244995038, 1555324019,
+            ],
+            3 => [
+                1475628651, 777565847, 704492386, 1218528120, 1245363405, 475884575, 649166061,
+            ],
+            4 => [
+                550038364, 948935655, 68722023, 1251345762, 1692456177, 1177958698, 350232928,
+            ],
+            5 => [
+                882720258, 821925756, 199955840, 812002876, 1484951277, 1063138035, 491712810,
+            ],
+            6 => [
+                738287111, 1955364991, 552724293, 1175775744, 341623997, 1454022463, 408193320,
+            ],
+            _ => unreachable!(),
+        };
+        SepticExtension(limbs.map(F::from_canonical_u32))
+    }
+
+    /// Computes `self[0] + sum_{i=1..6} z_pow_p(i) * self[i]`, i.e. the map `x -> x^p` provided
+    /// the table entries are `z^(i * p)`.
+    #[must_use]
+    fn frobenius(&self) -> Self {
+        let mut acc = Self::zero();
+        acc += self.0[0];
+        for (index, coeff) in (1u32..).zip(&self.0[1..]) {
+            acc += Self::z_pow_p(index) * *coeff;
+        }
+        acc
+    }
+
+    /// Computes `self[0] + sum_{i=1..6} z_pow_p2(i) * self[i]`, i.e. the map `x -> x^(p^2)`
+    /// provided the table entries are `z^(i * p^2)`.
+    #[must_use]
+    fn double_frobenius(&self) -> Self {
+        let mut acc = Self::zero();
+        acc += self.0[0];
+        for (index, coeff) in (1u32..).zip(&self.0[1..]) {
+            acc += Self::z_pow_p2(index) * *coeff;
+        }
+        acc
+    }
+
+    /// Multiplies the Frobenius images of `self`: with `base = x^p * x^(p^2)` the result is
+    /// `base * base^(p^2) * base^(p^4) = x^(p + p^2 + ... + p^6)`, i.e. `x^(r - 1)` for
+    /// `r = 1 + p + ... + p^6`.
+    #[must_use]
+    fn pow_r_1(&self) -> Self {
+        let base = self.frobenius() * self.double_frobenius();
+        let base_p2 = base.double_frobenius();
+        let base_p4 = base_p2.double_frobenius();
+        base * base_p2 * base_p4
+    }
+
+    /// Inverts `self` as `x^(r - 1) / x^r`, where only the constant coefficient of `x^r` is used
+    /// (presumably `x^r` is the norm and lies in the base field).
+    ///
+    /// Callers must rule out zero first (`try_inverse` does); the base-field `inverse()` is
+    /// applied to the constant coefficient of `x^r`.
+    #[must_use]
+    fn inv(&self) -> Self {
+        let x_r_minus_1 = self.pow_r_1();
+        let x_r = x_r_minus_1 * *self;
+        let norm_inverse = x_r.0[0].inverse();
+        x_r_minus_1 * norm_inverse
+    }
+
+    /// Returns the constant coefficient of `x^r` together with whether raising it to
+    /// `(p - 1) / 2` gives one (Euler's criterion on that base-field value).
+    fn is_square(&self) -> (F, bool) {
+        let x_r = self.pow_r_1() * *self;
+        let norm = x_r.0[0];
+        let half_order = ((F::order() - BigUint::one()) / BigUint::from(2u8)).to_u64_digits()[0];
+
+        (norm, norm.exp_u64(half_order) == F::one())
+    }
+
+    /// Computes the square root of the septic field extension element.
+    /// Returns None if the element is not a square, and Some(result) if it is a square.
+    pub fn sqrt(&self) -> Option<Self> {
+        let n = *self;
+
+        // Zero and one are their own square roots.
+        if n == Self::zero() || n == Self::one() {
+            return Some(n);
+        }
+
+        let (numerator, is_square) = n.is_square();
+        if !is_square {
+            return None;
+        }
+
+        // After round k, `squared` is n^(2^k); rounds 26..=29 are multiplied in, giving
+        // n_power = n^(1 + 2^26 + 2^27 + 2^28 + 2^29).
+        let mut squared = n;
+        let mut n_power = n;
+        for round in 1..30 {
+            squared = squared * squared;
+            if round >= 26 {
+                n_power = n_power * squared;
+            }
+        }
+
+        // Product of the p, p^3 and p^5 Frobenius images of n_power, times n itself.
+        let frob_1 = n_power.frobenius();
+        let frob_3 = frob_1.double_frobenius();
+        let frob_5 = frob_3.double_frobenius();
+        let denominator = frob_1 * frob_3 * frob_5 * n;
+
+        // Cipolla's algorithm in the base field on `base = 1 / numerator`: walk `a` through
+        // powers of the generator until `a^2 - base` fails Euler's criterion.
+        let base = numerator.inverse();
+        let g = F::generator();
+        let legendre_exp = ((F::order() - BigUint::one()) / BigUint::from(2u8)).to_u64_digits()[0];
+        let mut a = F::one();
+        let mut nonresidue = F::one() - base;
+        while nonresidue.exp_u64(legendre_exp) == F::one() {
+            a *= g;
+            nonresidue = a.square() - base;
+        }
+
+        let cipolla_pow = (F::order() + BigUint::one()) / BigUint::from(2u8);
+        let root = CipollaExtension::new(a, F::one()).pow(&cipolla_pow, nonresidue);
+
+        Some(denominator * root.real)
+    }
+}
+
+impl<F: PrimeField> SepticExtension<F> {
+    /// Returns whether the extension field element viewed as an y-coordinate of a digest represents a receive interaction.
+    ///
+    /// True when the canonical value of the last coefficient lies in `[1, (p - 1) / 2]`.
+    pub fn is_receive(&self) -> bool {
+        let last = self.0[6].as_canonical_biguint();
+        let lower = BigUint::from(1u32);
+        let upper = (F::order() - BigUint::from(1u32)) / BigUint::from(2u32);
+        lower <= last && last <= upper
+    }
+
+    /// Returns whether the extension field element viewed as an y-coordinate of a digest represents a send interaction.
+    ///
+    /// True when the canonical value of the last coefficient lies in `[(p + 1) / 2, p - 1]`.
+    pub fn is_send(&self) -> bool {
+        let last = self.0[6].as_canonical_biguint();
+        let lower = (F::order() + BigUint::from(1u32)) / BigUint::from(2u32);
+        let upper = F::order() - BigUint::from(1u32);
+        lower <= last && last <= upper
+    }
+
+    /// Returns whether the extension field element viewed as an y-coordinate of a digest cannot represent anything.
+    ///
+    /// True when the last coefficient is zero.
+    pub fn is_exception(&self) -> bool {
+        let last = self.0[6].as_canonical_biguint();
+        last == BigUint::from(0u32)
+    }
+}
+
+/// Extension field for Cipolla's algorithm, taken from <https://github.com/Plonky3/Plonky3/pull/439/files>. Represents `real + imag * w`, where `w^2` is the `nonresidue` argument passed to `mul_ext`/`pow`.
+#[derive(Clone, Copy, Debug)]
+struct CipollaExtension<F: Field> {
+    real: F, // coefficient of 1
+    imag: F, // coefficient of `w`
+}
+
+impl<F: Field> CipollaExtension<F> {
+    /// Builds `real + imag * w`.
+    fn new(real: F, imag: F) -> Self {
+        CipollaExtension { real, imag }
+    }
+
+    /// The multiplicative identity `1 + 0 * w`.
+    fn one() -> Self {
+        Self::new(F::one(), F::zero())
+    }
+
+    /// Multiplies `self` by `other`, using `w^2 = nonresidue`.
+    fn mul_ext(&self, other: Self, nonresidue: F) -> Self {
+        let real = self.real * other.real + nonresidue * self.imag * other.imag;
+        let imag = self.real * other.imag + self.imag * other.real;
+        Self::new(real, imag)
+    }
+
+    /// Raises `self` to `exp` by square-and-multiply, scanning the bits of `exp` from the least
+    /// significant one upwards.
+    fn pow(&self, exp: &BigUint, nonresidue: F) -> Self {
+        let mut acc = Self::one();
+        // `power` holds self^(2^bit) at the start of each iteration.
+        let mut power = *self;
+
+        for bit in 0..exp.bits() {
+            if exp.bit(bit) {
+                acc = acc.mul_ext(power, nonresidue);
+            }
+            power = power.mul_ext(power, nonresidue);
+        }
+        acc
+    }
+}
+
+/// A block of seven columns holding the coefficients of a septic extension element (see `as_extension`).
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[repr(C)]
+pub struct SepticBlock<T>(pub [T; 7]);
+
+impl<T> SepticBlock<T> {
+    /// Maps a `SepticBlock<T>` to `SepticBlock<U>` by applying `f` to each of the seven entries
+    /// in order.
+    pub fn map<F, U>(self, f: F) -> SepticBlock<U>
+    where
+        F: FnMut(T) -> U,
+    {
+        let SepticBlock(entries) = self;
+        SepticBlock(entries.map(f))
+    }
+
+    /// A function similar to `core::array::from_fn`: entry `i` of the block is `f(i)`.
+    pub fn from_base_fn<G: FnMut(usize) -> T>(f: G) -> Self {
+        let entries: [T; 7] = array::from_fn(f);
+        SepticBlock(entries)
+    }
+}
+
+impl<T: Clone> SepticBlock<T> {
+    /// Takes a `SepticBlock` into a `SepticExtension` of expressions, turning each variable into
+    /// an expression by adding it to `AB::Expr::zero()`.
+    pub fn as_extension<AB: SepticExtensionAirBuilder<Var = T>>(
+        &self,
+    ) -> SepticExtension<AB::Expr> {
+        let vars = self.0.clone();
+        SepticExtension(vars.map(|var| AB::Expr::zero() + var))
+    }
+
+    /// Takes a single expression into a `SepticExtension` of expressions: `base` becomes the
+    /// constant coefficient and every other coefficient is zero. The entries of `self` do not
+    /// influence the result.
+    pub fn as_extension_from_base<AB: SP1AirBuilder<Var = T>>(
+        &self,
+        base: AB::Expr,
+    ) -> SepticExtension<AB::Expr> {
+        let mut coefficients: [AB::Expr; 7] = self.0.clone().map(|_| AB::Expr::zero());
+        let constant = &mut coefficients[0];
+        *constant = base;
+
+        SepticExtension(coefficients)
+    }
+}
+
+impl<T> From<[T; 7]> for SepticBlock<T> {
+    /// Wraps an array of seven entries without modifying it.
+    fn from(arr: [T; 7]) -> Self {
+        SepticBlock(arr)
+    }
+}
+
+impl<T: AbstractField> From<T> for SepticBlock<T> {
+    /// Places `value` in the first entry and fills the remaining six entries with zero.
+    fn from(value: T) -> Self {
+        let zero = T::zero;
+        SepticBlock([value, zero(), zero(), zero(), zero(), zero(), zero()])
+    }
+}
+
+impl<T: Copy> From<&[T]> for SepticBlock<T> {
+    /// Copies a slice of exactly seven entries into a block.
+    ///
+    /// # Panics
+    /// Panics when `slice.len() != 7`.
+    fn from(slice: &[T]) -> Self {
+        SepticBlock(<[T; 7]>::try_from(slice).unwrap())
+    }
+}
+
+impl<T, I> Index<I> for SepticBlock<T>
+where
+    [T]: Index<I>,
+{
+    type Output = <[T] as Index<I>>::Output;
+
+    /// Indexes the underlying entries exactly like a slice, so any slice index type works.
+    #[inline]
+    fn index(&self, index: I) -> &Self::Output {
+        let entries: &[T] = &self.0;
+        &entries[index]
+    }
+}
+
+impl<T, I> IndexMut<I> for SepticBlock<T>
+where
+    [T]: IndexMut<I>,
+{
+    /// Mutably indexes the underlying entries exactly like a slice.
+    #[inline]
+    fn index_mut(&mut self, index: I) -> &mut Self::Output {
+        let entries: &mut [T] = &mut self.0;
+        &mut entries[index]
+    }
+}
+
+impl<T> IntoIterator for SepticBlock<T> {
+    type Item = T;
+    type IntoIter = std::array::IntoIter<T, 7>;
+
+    /// Consumes the block and yields its seven entries by value, in order.
+    fn into_iter(self) -> Self::IntoIter {
+        let SepticBlock(entries) = self;
+        entries.into_iter()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_baby_bear::BabyBear;
+
+    use super::*;
+
+    /// Builds the deterministic sample element shared by `test_inv` and `test_sqrt`.
+    fn sample_element(i: u32) -> SepticExtension<BabyBear> {
+        SepticExtension([
+            BabyBear::from_canonical_u32(i + 3),
+            BabyBear::from_canonical_u32(2 * i + 6),
+            BabyBear::from_canonical_u32(5 * i + 17),
+            BabyBear::from_canonical_u32(6 * i + 91),
+            BabyBear::from_canonical_u32(8 * i + 37),
+            BabyBear::from_canonical_u32(11 * i + 35),
+            BabyBear::from_canonical_u32(14 * i + 33),
+        ])
+    }
+
+    /// Multiplying the embedded constants 1 and 2 must give the embedded constant 2.
+    #[test]
+    fn test_mul() {
+        let a: SepticExtension<BabyBear> = SepticExtension::from_canonical_u32(1);
+        let b: SepticExtension<BabyBear> = SepticExtension::from_canonical_u32(2);
+        let c = a * b;
+        println!("{c}");
+        // Previously the product was only printed; pin the expected value as well.
+        assert_eq!(c, SepticExtension::<BabyBear>::from_canonical_u32(2));
+    }
+
+    /// `inv` must return a multiplicative inverse for every sample element.
+    #[test]
+    fn test_inv() {
+        for i in 0..256 {
+            let a = sample_element(i);
+            let b = a.inv();
+            assert_eq!(a * b, SepticExtension::<BabyBear>::one());
+        }
+    }
+
+    /// Powers of the generator must be reported as squares exactly for even exponents.
+    #[test]
+    fn test_legendre() {
+        let a: SepticExtension<BabyBear> = SepticExtension::generator();
+        let mut b = SepticExtension::<BabyBear>::one();
+        for i in 1..256 {
+            b *= a;
+            let (_, c) = b.is_square();
+            assert!(c == (i % 2 == 0));
+        }
+    }
+
+    /// `sqrt` must recover a root of every square and reject odd powers of the generator.
+    #[test]
+    fn test_sqrt() {
+        for i in 0..256 {
+            let a = sample_element(i);
+            let b = a * a;
+            let recovered_a = b.sqrt().unwrap();
+            assert_eq!(recovered_a * recovered_a, b);
+        }
+        let mut b = SepticExtension::<BabyBear>::one();
+        for i in 1..256 {
+            let a: SepticExtension<BabyBear> = SepticExtension::generator();
+            b *= a;
+            let c = b.sqrt();
+            if i % 2 == 1 {
+                assert!(c.is_none());
+            } else {
+                let c = c.unwrap();
+                assert_eq!(c * c, b);
+            }
+        }
+    }
+}
diff --git a/crates/stark/src/types.rs b/crates/stark/src/types.rs
index 533a8006da..cff129e364 100644
--- a/crates/stark/src/types.rs
+++ b/crates/stark/src/types.rs
@@ -13,7 +13,7 @@ use p3_matrix::{
 use serde::{Deserialize, Serialize};
 
 use super::{Challenge, Com, OpeningProof, StarkGenericConfig, Val};
-use crate::air::InteractionScope;
+use crate::septic_digest::SepticDigest;
 
 pub type QuotientOpenedValues<T> = Vec<T>;
 
@@ -39,8 +39,7 @@ impl<SC: StarkGenericConfig, M, P> ShardMainData<SC, M, P> {
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ShardCommitment<C> {
-    pub global_main_commit: C,
-    pub local_main_commit: C,
+    pub main_commit: C,
     pub permutation_commit: C,
     pub quotient_commit: C,
 }
@@ -54,33 +53,33 @@ pub struct AirOpenedValues<T> {
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(bound(serialize = "T: Serialize"))]
-#[serde(bound(deserialize = "T: Deserialize<'de>"))]
-pub struct ChipOpenedValues<T> {
-    pub preprocessed: AirOpenedValues<T>,
-    pub main: AirOpenedValues<T>,
-    pub permutation: AirOpenedValues<T>,
-    pub quotient: Vec<Vec<T>>,
-    pub global_cumulative_sum: T,
-    pub local_cumulative_sum: T,
+#[serde(bound(serialize = "F: Serialize, EF: Serialize"))]
+#[serde(bound(deserialize = "F: Deserialize<'de>, EF: Deserialize<'de>"))]
+pub struct ChipOpenedValues<F, EF> {
+    pub preprocessed: AirOpenedValues<EF>,
+    pub main: AirOpenedValues<EF>,
+    pub permutation: AirOpenedValues<EF>,
+    pub quotient: Vec<Vec<EF>>,
+    pub global_cumulative_sum: SepticDigest<F>,
+    pub local_cumulative_sum: EF,
     pub log_degree: usize,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ShardOpenedValues<T> {
-    pub chips: Vec<ChipOpenedValues<T>>,
+pub struct ShardOpenedValues<F, EF> {
+    pub chips: Vec<ChipOpenedValues<F, EF>>,
 }
 
 /// The maximum number of elements that can be stored in the public values vec.  Both SP1 and
 /// recursive proofs need to pad their public values vec to this length.  This is required since the
 /// recursion verification program expects the public values vec to be fixed length.
-pub const PROOF_MAX_NUM_PVS: usize = 371;
+pub const PROOF_MAX_NUM_PVS: usize = 231;
 
 #[derive(Serialize, Deserialize, Clone)]
 #[serde(bound = "")]
 pub struct ShardProof<SC: StarkGenericConfig> {
     pub commitment: ShardCommitment<Com<SC>>,
-    pub opened_values: ShardOpenedValues<Challenge<SC>>,
+    pub opened_values: ShardOpenedValues<Val<SC>, Challenge<SC>>,
     pub opening_proof: OpeningProof<SC>,
     pub chip_ordering: HashMap<String, usize>,
     pub public_values: Vec<Val<SC>>,
@@ -93,18 +92,22 @@ pub struct ProofShape {
 
 impl ProofShape {
     #[must_use]
-    pub fn from_traces<V: Clone + Send + Sync>(
-        global_traces: Option<&[(String, RowMajorMatrix<V>)]>,
-        local_traces: &[(String, RowMajorMatrix<V>)],
-    ) -> Self {
-        global_traces
-            .into_iter()
-            .flatten()
-            .chain(local_traces.iter())
+    pub fn from_traces<V: Clone + Send + Sync>(traces: &[(String, RowMajorMatrix<V>)]) -> Self {
+        traces
+            .iter()
             .map(|(name, trace)| (name.clone(), trace.height().ilog2() as usize))
             .sorted_by_key(|(_, height)| *height)
             .collect()
     }
+
+    #[must_use]
+    pub fn from_log2_heights(traces: &[(String, usize)]) -> Self {
+        traces
+            .iter()
+            .map(|(name, height)| (name.clone(), *height))
+            .sorted_by_key(|(_, height)| *height)
+            .collect()
+    }
 }
 
 impl<SC: StarkGenericConfig> Debug for ShardProof<SC> {
@@ -123,15 +126,12 @@ impl<T: Send + Sync + Clone> AirOpenedValues<T> {
 }
 
 impl<SC: StarkGenericConfig> ShardProof<SC> {
-    pub fn cumulative_sum(&self, scope: InteractionScope) -> Challenge<SC> {
-        self.opened_values
-            .chips
-            .iter()
-            .map(|c| match scope {
-                InteractionScope::Global => c.global_cumulative_sum,
-                InteractionScope::Local => c.local_cumulative_sum,
-            })
-            .sum()
+    pub fn local_cumulative_sum(&self) -> Challenge<SC> {
+        self.opened_values.chips.iter().map(|c| c.local_cumulative_sum).sum()
+    }
+
+    pub fn global_cumulative_sum(&self) -> SepticDigest<Val<SC>> {
+        self.opened_values.chips.iter().map(|c| c.global_cumulative_sum).sum()
     }
 
     pub fn log_degree_cpu(&self) -> usize {
diff --git a/crates/stark/src/verifier.rs b/crates/stark/src/verifier.rs
index 2fb3e8cef3..28e80612e2 100644
--- a/crates/stark/src/verifier.rs
+++ b/crates/stark/src/verifier.rs
@@ -33,7 +33,6 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
         chips: &[&MachineChip<SC, A>],
         challenger: &mut SC::Challenger,
         proof: &ShardProof<SC>,
-        global_permutation_challenges: &[SC::Challenge],
     ) -> Result<(), VerificationError<SC>>
     where
         A: for<'a> Air<VerifierConstraintFolder<'a, SC>>,
@@ -55,8 +54,6 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
             return Err(VerificationError::ChipOpeningLengthMismatch);
         }
 
-        let chip_scopes = chips.iter().map(|chip| chip.commit_scope()).collect::<Vec<_>>();
-
         // Assert that the byte multiplicities don't overflow.
         let mut max_byte_lookup_mult = 0u64;
         chips.iter().zip(opened_values.chips.iter()).for_each(|(chip, val)| {
@@ -84,14 +81,9 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
             .map(|log_degree| pcs.natural_domain_for_degree(1 << log_degree))
             .collect::<Vec<_>>();
 
-        let ShardCommitment {
-            global_main_commit,
-            local_main_commit,
-            permutation_commit,
-            quotient_commit,
-        } = commitment;
+        let ShardCommitment { main_commit, permutation_commit, quotient_commit } = commitment;
 
-        challenger.observe(local_main_commit.clone());
+        challenger.observe(main_commit.clone());
 
         let local_permutation_challenges =
             (0..2).map(|_| challenger.sample_ext_element::<SC::Challenge>()).collect::<Vec<_>>();
@@ -100,21 +92,19 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
         // Observe the cumulative sums and constrain any sum without a corresponding scope to be
         // zero.
         for (opening, chip) in opened_values.chips.iter().zip_eq(chips.iter()) {
-            let global_sum = opening.global_cumulative_sum;
             let local_sum = opening.local_cumulative_sum;
-            challenger.observe_slice(global_sum.as_base_slice());
+            let global_sum = opening.global_cumulative_sum;
+
             challenger.observe_slice(local_sum.as_base_slice());
+            challenger.observe_slice(&global_sum.0.x.0);
+            challenger.observe_slice(&global_sum.0.y.0);
 
-            let has_global_interactions = chip
-                .sends()
-                .iter()
-                .chain(chip.receives())
-                .any(|i| i.scope == InteractionScope::Global);
-            if !has_global_interactions && !global_sum.is_zero() {
+            if chip.commit_scope() == InteractionScope::Local && !global_sum.is_zero() {
                 return Err(VerificationError::CumulativeSumsError(
-                    "global cumulative sum is non-zero, but no global interactions",
+                    "global cumulative sum is non-zero, but chip is Local",
                 ));
             }
+
             let has_local_interactions = chip
                 .sends()
                 .iter()
@@ -210,47 +200,19 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
             })
             .collect::<Vec<_>>();
 
-        // Split the main_domains_points_and_opens to the global and local chips.
-        let mut global_trace_points_and_openings = Vec::new();
-        let mut local_trace_points_and_openings = Vec::new();
-        for (i, points_and_openings) in
-            main_domains_points_and_opens.clone().into_iter().enumerate()
-        {
-            let scope = chip_scopes[i];
-            if scope == InteractionScope::Global {
-                global_trace_points_and_openings.push(points_and_openings);
-            } else {
-                local_trace_points_and_openings.push(points_and_openings);
-            }
-        }
-
-        let rounds = if !global_trace_points_and_openings.is_empty() {
-            vec![
-                (vk.commit.clone(), preprocessed_domains_points_and_opens),
-                (global_main_commit.clone(), global_trace_points_and_openings),
-                (local_main_commit.clone(), local_trace_points_and_openings),
-                (permutation_commit.clone(), perm_domains_points_and_opens),
-                (quotient_commit.clone(), quotient_domains_points_and_opens),
-            ]
-        } else {
-            vec![
-                (vk.commit.clone(), preprocessed_domains_points_and_opens),
-                (local_main_commit.clone(), local_trace_points_and_openings),
-                (permutation_commit.clone(), perm_domains_points_and_opens),
-                (quotient_commit.clone(), quotient_domains_points_and_opens),
-            ]
-        };
+        let rounds = vec![
+            (vk.commit.clone(), preprocessed_domains_points_and_opens),
+            (main_commit.clone(), main_domains_points_and_opens),
+            (permutation_commit.clone(), perm_domains_points_and_opens),
+            (quotient_commit.clone(), quotient_domains_points_and_opens),
+        ];
 
         config
             .pcs()
             .verify(rounds, opening_proof, challenger)
             .map_err(|e| VerificationError::InvalidopeningArgument(e))?;
 
-        let permutation_challenges = global_permutation_challenges
-            .iter()
-            .chain(local_permutation_challenges.iter())
-            .copied()
-            .collect::<Vec<_>>();
+        let permutation_challenges = local_permutation_challenges;
 
         // Verify the constrtaint evaluations.
         for (chip, trace_domain, qc_domains, values) in
@@ -273,7 +235,7 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
             .map_err(|_| VerificationError::OodEvaluationMismatch(chip.name()))?;
         }
         // Verify that the local cumulative sum is zero.
-        let local_cumulative_sum = proof.cumulative_sum(InteractionScope::Local);
+        let local_cumulative_sum = proof.local_cumulative_sum();
         if local_cumulative_sum != SC::Challenge::zero() {
             return Err(VerificationError::CumulativeSumsError("local cumulative sum is not zero"));
         }
@@ -283,7 +245,7 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
 
     fn verify_opening_shape(
         chip: &MachineChip<SC, A>,
-        opening: &ChipOpenedValues<SC::Challenge>,
+        opening: &ChipOpenedValues<Val<SC>, SC::Challenge>,
     ) -> Result<(), OpeningShapeError> {
         // Verify that the preprocessed width matches the expected value for the chip.
         if opening.preprocessed.local.len() != chip.preprocessed_width() {
@@ -326,7 +288,6 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
                 opening.permutation.next.len(),
             ));
         }
-
         // Verift that the number of quotient chunks matches the expected value for the chip.
         if opening.quotient.len() != chip.quotient_width() {
             return Err(OpeningShapeError::QuotientWidthMismatch(
@@ -352,7 +313,7 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
     #[allow(clippy::needless_pass_by_value)]
     fn verify_constraints(
         chip: &MachineChip<SC, A>,
-        opening: &ChipOpenedValues<SC::Challenge>,
+        opening: &ChipOpenedValues<Val<SC>, SC::Challenge>,
         trace_domain: Domain<SC>,
         qc_domains: Vec<Domain<SC>>,
         zeta: SC::Challenge,
@@ -389,7 +350,7 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
     /// Evaluates the constraints for a chip and opening.
     pub fn eval_constraints(
         chip: &MachineChip<SC, A>,
-        opening: &ChipOpenedValues<SC::Challenge>,
+        opening: &ChipOpenedValues<Val<SC>, SC::Challenge>,
         selectors: &LagrangeSelectors<SC::Challenge>,
         alpha: SC::Challenge,
         permutation_challenges: &[SC::Challenge],
@@ -412,14 +373,13 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
             next: unflatten(&opening.permutation.next),
         };
 
-        let cumulative_sums = [opening.global_cumulative_sum, opening.local_cumulative_sum];
-        let cumulative_sums = cumulative_sums.as_slice();
         let mut folder = VerifierConstraintFolder::<SC> {
             preprocessed: opening.preprocessed.view(),
             main: opening.main.view(),
             perm: perm_opening.view(),
             perm_challenges: permutation_challenges,
-            cumulative_sums,
+            local_cumulative_sum: &opening.local_cumulative_sum,
+            global_cumulative_sum: &opening.global_cumulative_sum,
             is_first_row: selectors.is_first_row,
             is_last_row: selectors.is_last_row,
             is_transition: selectors.is_transition,
@@ -436,7 +396,7 @@ impl<SC: StarkGenericConfig, A: MachineAir<Val<SC>>> Verifier<SC, A> {
 
     /// Recomputes the quotient for a chip and opening.
     pub fn recompute_quotient(
-        opening: &ChipOpenedValues<SC::Challenge>,
+        opening: &ChipOpenedValues<Val<SC>, SC::Challenge>,
         qc_domains: &[Domain<SC>],
         zeta: SC::Challenge,
     ) -> SC::Challenge {
diff --git a/crates/test-artifacts/programs/Cargo.lock b/crates/test-artifacts/programs/Cargo.lock
index b404318c5a..b4d8436fce 100644
--- a/crates/test-artifacts/programs/Cargo.lock
+++ b/crates/test-artifacts/programs/Cargo.lock
@@ -16,15 +16,15 @@ dependencies = [
 
 [[package]]
 name = "allocator-api2"
-version = "0.2.18"
+version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
 
 [[package]]
 name = "anyhow"
-version = "1.0.89"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6"
+checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
 
 [[package]]
 name = "arrayref"
@@ -107,7 +107,7 @@ version = "1.1.0"
 dependencies = [
  "common-test-utils",
  "sp1-curves",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-zkvm",
 ]
 
@@ -150,7 +150,7 @@ name = "bls12381-mul-test"
 version = "1.1.0"
 dependencies = [
  "sp1-derive",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-zkvm",
 ]
 
@@ -160,7 +160,7 @@ version = "1.1.0"
 dependencies = [
  "common-test-utils",
  "sp1-curves",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-zkvm",
 ]
 
@@ -203,7 +203,7 @@ name = "bn254-mul-test"
 version = "1.1.0"
 dependencies = [
  "sp1-derive",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-zkvm",
 ]
 
@@ -215,9 +215,9 @@ checksum = "c3ac9f8b63eca6fd385229b3675f6cc0dc5c8a5c8a54a59d4f52ffd670d87b0c"
 
 [[package]]
 name = "bytemuck"
-version = "1.18.0"
+version = "1.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94bbb0ad554ad961ddc5da507a12a29b14e4ae5bda06b19f575a3e6079d2e2ae"
+checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a"
 
 [[package]]
 name = "byteorder"
@@ -227,9 +227,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.7.2"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3"
+checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
 dependencies = [
  "serde",
 ]
@@ -245,7 +245,7 @@ name = "common-test-utils"
 version = "1.1.0"
 dependencies = [
  "num-bigint",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
 ]
 
 [[package]]
@@ -268,9 +268,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.14"
+version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0"
+checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3"
 dependencies = [
  "libc",
 ]
@@ -320,9 +320,9 @@ dependencies = [
 
 [[package]]
 name = "crypto-bigint"
-version = "0.6.0-rc.5"
+version = "0.6.0-rc.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "040a95c58773f47c92f5f17814702bfd68e8ace9ddce4690c982d0019cac32e2"
+checksum = "d748d1f5b807ee6d0df5a548d0130417295c3aaed1dcbbb3d6a2e7106e11fcca"
 dependencies = [
  "hybrid-array",
  "num-traits",
@@ -344,31 +344,16 @@ dependencies = [
 [[package]]
 name = "curve25519-dalek"
 version = "4.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
-dependencies = [
- "cfg-if",
- "cpufeatures",
- "curve25519-dalek-derive 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
- "fiat-crypto",
- "rustc_version",
- "subtle",
- "zeroize",
-]
-
-[[package]]
-name = "curve25519-dalek"
-version = "4.1.3"
-source = "git+https://github.com/sp1-patches/curve25519-dalek?branch=patch-curve25519-v4.1.3#1d73fd95f1a76bee8f46643cf78bbccc1fb06ede"
+source = "git+https://github.com/sp1-patches/curve25519-dalek?branch=patch-curve25519-v4.1.3#a9d46282f5660dfb7e3850ef957fe884089daeda"
 dependencies = [
  "anyhow",
  "cfg-if",
  "cpufeatures",
- "curve25519-dalek-derive 0.1.1 (git+https://github.com/sp1-patches/curve25519-dalek?branch=patch-curve25519-v4.1.3)",
+ "curve25519-dalek-derive",
  "digest 0.10.7",
  "fiat-crypto",
  "rustc_version",
- "sp1-lib 1.2.0",
+ "sp1-lib 3.4.0",
  "subtle",
  "zeroize",
 ]
@@ -376,22 +361,11 @@ dependencies = [
 [[package]]
 name = "curve25519-dalek-derive"
 version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.87",
-]
-
-[[package]]
-name = "curve25519-dalek-derive"
-version = "0.1.1"
-source = "git+https://github.com/sp1-patches/curve25519-dalek?branch=patch-curve25519-v4.1.3#1d73fd95f1a76bee8f46643cf78bbccc1fb06ede"
+source = "git+https://github.com/sp1-patches/curve25519-dalek?branch=patch-curve25519-v4.1.3#a9d46282f5660dfb7e3850ef957fe884089daeda"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -404,7 +378,7 @@ dependencies = [
  "cfg-if",
  "digest 0.9.0",
  "rand_core",
- "sp1-lib 3.2.0",
+ "sp1-lib 3.4.0",
  "subtle-ng",
  "zeroize",
 ]
@@ -533,7 +507,27 @@ checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
+]
+
+[[package]]
+name = "derive_more"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
+dependencies = [
+ "derive_more-impl",
+]
+
+[[package]]
+name = "derive_more-impl"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -611,9 +605,9 @@ dependencies = [
 [[package]]
 name = "ed25519-dalek"
 version = "2.1.1"
-source = "git+https://github.com/sp1-patches/curve25519-dalek?branch=patch-curve25519-v4.1.3#1d73fd95f1a76bee8f46643cf78bbccc1fb06ede"
+source = "git+https://github.com/sp1-patches/curve25519-dalek?branch=patch-curve25519-v4.1.3#a9d46282f5660dfb7e3850ef957fe884089daeda"
 dependencies = [
- "curve25519-dalek 4.1.3 (git+https://github.com/sp1-patches/curve25519-dalek?branch=patch-curve25519-v4.1.3)",
+ "curve25519-dalek",
  "ed25519",
  "serde",
  "sha2 0.10.8",
@@ -663,7 +657,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cc43715037532dc2d061e5c97e81b684c28993d52a4fa4eb7d2ce2826d78f2f2"
 dependencies = [
  "base16ct",
- "crypto-bigint 0.6.0-rc.5",
+ "crypto-bigint 0.6.0-rc.6",
  "hybrid-array",
  "rand_core",
  "sec1 0.8.0-rc.3",
@@ -711,9 +705,9 @@ dependencies = [
 
 [[package]]
 name = "futures"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
+checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -725,9 +719,9 @@ dependencies = [
 
 [[package]]
 name = "futures-channel"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
+checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -735,33 +729,33 @@ dependencies = [
 
 [[package]]
 name = "futures-core"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
 
 [[package]]
 name = "futures-io"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
+checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
 
 [[package]]
 name = "futures-sink"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
+checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
 
 [[package]]
 name = "futures-task"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
+checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
 
 [[package]]
 name = "futures-util"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
+checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -789,9 +783,9 @@ dependencies = [
 
 [[package]]
 name = "generic-array"
-version = "1.1.0"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96512db27971c2c3eece70a1e106fbe6c87760234e31e8f7e5634912fe52794a"
+checksum = "2cb8bc4c28d15ade99c7e90b219f30da4be5c88e586277e8cbe886beeb868ab2"
 dependencies = [
  "serde",
  "typenum",
@@ -832,9 +826,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.15.0"
+version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
 
 [[package]]
 name = "heck"
@@ -882,23 +876,23 @@ dependencies = [
 
 [[package]]
 name = "impl-trait-for-tuples"
-version = "0.2.2"
+version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11d7a9f6330b71fea57921c9b61c47ee6e84f72d394754eff6163ae67e7395eb"
+checksum = "a0eb5a3343abf848c0984fe4604b2b105da9539376e24fc0a3b0007411ae4fd9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "indexmap"
-version = "2.6.0"
+version = "2.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
+checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
 dependencies = [
  "equivalent",
- "hashbrown 0.15.0",
+ "hashbrown 0.15.2",
 ]
 
 [[package]]
@@ -921,9 +915,9 @@ dependencies = [
 
 [[package]]
 name = "itoa"
-version = "1.0.11"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
+checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
 
 [[package]]
 name = "k256"
@@ -962,15 +956,15 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 
 [[package]]
 name = "libc"
-version = "0.2.159"
+version = "0.2.167"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
+checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
 
 [[package]]
 name = "libm"
-version = "0.2.8"
+version = "0.2.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
 
 [[package]]
 name = "memchr"
@@ -1094,12 +1088,9 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.20.1"
+version = "1.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1"
-dependencies = [
- "portable-atomic",
-]
+checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
 
 [[package]]
 name = "opaque-debug"
@@ -1122,7 +1113,7 @@ dependencies = [
 [[package]]
 name = "p3-air"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -1131,7 +1122,7 @@ dependencies = [
 [[package]]
 name = "p3-baby-bear"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "num-bigint",
  "p3-field",
@@ -1145,7 +1136,7 @@ dependencies = [
 [[package]]
 name = "p3-challenger"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-maybe-rayon",
@@ -1158,7 +1149,7 @@ dependencies = [
 [[package]]
 name = "p3-commit"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-challenger",
@@ -1171,7 +1162,7 @@ dependencies = [
 [[package]]
 name = "p3-dft"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -1183,7 +1174,7 @@ dependencies = [
 [[package]]
 name = "p3-field"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "num-bigint",
@@ -1196,7 +1187,7 @@ dependencies = [
 [[package]]
 name = "p3-fri"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-challenger",
@@ -1214,7 +1205,7 @@ dependencies = [
 [[package]]
 name = "p3-interpolation"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -1224,7 +1215,7 @@ dependencies = [
 [[package]]
 name = "p3-matrix"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-field",
@@ -1238,7 +1229,7 @@ dependencies = [
 [[package]]
 name = "p3-maybe-rayon"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "rayon",
 ]
@@ -1246,7 +1237,7 @@ dependencies = [
 [[package]]
 name = "p3-mds"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-dft",
@@ -1260,7 +1251,7 @@ dependencies = [
 [[package]]
 name = "p3-merkle-tree"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-commit",
@@ -1276,7 +1267,7 @@ dependencies = [
 [[package]]
 name = "p3-poseidon2"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "gcd",
  "p3-field",
@@ -1289,7 +1280,7 @@ dependencies = [
 [[package]]
 name = "p3-symmetric"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-field",
@@ -1299,7 +1290,7 @@ dependencies = [
 [[package]]
 name = "p3-uni-stark"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-air",
@@ -1317,7 +1308,7 @@ dependencies = [
 [[package]]
 name = "p3-util"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "serde",
 ]
@@ -1370,9 +1361,9 @@ dependencies = [
 
 [[package]]
 name = "pin-project-lite"
-version = "0.2.14"
+version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
+checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff"
 
 [[package]]
 name = "pin-utils"
@@ -1390,12 +1381,6 @@ dependencies = [
  "spki",
 ]
 
-[[package]]
-name = "portable-atomic"
-version = "1.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2"
-
 [[package]]
 name = "powerfmt"
 version = "0.2.0"
@@ -1431,9 +1416,9 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.86"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
 dependencies = [
  "unicode-ident",
 ]
@@ -1458,7 +1443,7 @@ dependencies = [
  "itertools 0.12.1",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1567,9 +1552,9 @@ dependencies = [
 
 [[package]]
 name = "rustversion"
-version = "1.0.17"
+version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
+checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248"
 
 [[package]]
 name = "ryu"
@@ -1579,26 +1564,26 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 
 [[package]]
 name = "scale-info"
-version = "2.11.3"
+version = "2.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eca070c12893629e2cc820a9761bedf6ce1dcddc9852984d1dc734b8bd9bd024"
+checksum = "346a3b32eba2640d17a9cb5927056b08f3de90f65b72fe09402c2ad07d684d0b"
 dependencies = [
  "cfg-if",
- "derive_more",
+ "derive_more 1.0.0",
  "parity-scale-codec",
  "scale-info-derive",
 ]
 
 [[package]]
 name = "scale-info-derive"
-version = "2.11.3"
+version = "2.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d35494501194174bda522a32605929eefc9ecf7e0a326c26db1fdd85881eb62"
+checksum = "c6630024bf739e2179b91fb424b28898baf819414262c5d376677dbff1fe7ebf"
 dependencies = [
  "proc-macro-crate",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1634,7 +1619,7 @@ version = "1.1.0"
 dependencies = [
  "common-test-utils",
  "sp1-curves",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-zkvm",
 ]
 
@@ -1670,7 +1655,7 @@ dependencies = [
  "num",
  "p256",
  "sp1-curves",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-zkvm",
 ]
 
@@ -1691,7 +1676,7 @@ dependencies = [
  "num",
  "p256",
  "sp1-curves",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-zkvm",
 ]
 
@@ -1703,9 +1688,9 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
 
 [[package]]
 name = "serde"
-version = "1.0.210"
+version = "1.0.215"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
+checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f"
 dependencies = [
  "serde_derive",
 ]
@@ -1721,20 +1706,20 @@ dependencies = [
 
 [[package]]
 name = "serde_derive"
-version = "1.0.210"
+version = "1.0.215"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
+checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.132"
+version = "1.0.133"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03"
+checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
 dependencies = [
  "itoa",
  "memchr",
@@ -1750,7 +1735,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1832,13 +1817,13 @@ dependencies = [
 
 [[package]]
 name = "sp1-curves"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "cfg-if",
- "curve25519-dalek 4.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
  "dashu",
+ "ecdsa",
  "elliptic-curve 0.13.8",
- "generic-array 1.1.0",
+ "generic-array 1.1.1",
  "itertools 0.13.0",
  "k256",
  "num",
@@ -1853,7 +1838,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-derive"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "quote",
  "syn 1.0.109",
@@ -1861,31 +1846,17 @@ dependencies = [
 
 [[package]]
 name = "sp1-lib"
-version = "1.2.0"
+version = "3.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bea7811abd2d3a991007fcb284f41152840b8388c171288d0c52c6793956609c"
+checksum = "7a5729da1b05d56c01457e5ecabdc77f1cc941df23f2921163a2f325aec22428"
 dependencies = [
- "anyhow",
  "bincode",
- "cfg-if",
- "hex",
  "serde",
- "snowbridge-amcl",
 ]
 
 [[package]]
 name = "sp1-lib"
-version = "3.0.0"
-dependencies = [
- "bincode",
- "serde",
-]
-
-[[package]]
-name = "sp1-lib"
-version = "3.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1aa18834c58df127706eb2fb2ea6e2892dbf0361d6b2485bf7b3fbd5f8b8c3c"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "serde",
@@ -1893,7 +1864,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-primitives"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "hex",
@@ -1909,11 +1880,12 @@ dependencies = [
 
 [[package]]
 name = "sp1-stark"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "arrayref",
  "hashbrown 0.14.5",
  "itertools 0.13.0",
+ "num-bigint",
  "num-traits",
  "p3-air",
  "p3-baby-bear",
@@ -1941,7 +1913,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-zkvm"
-version = "3.0.1"
+version = "4.0.0-rc.2"
 dependencies = [
  "cfg-if",
  "getrandom",
@@ -1951,7 +1923,7 @@ dependencies = [
  "p3-field",
  "rand",
  "sha2 0.10.8",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-primitives",
 ]
 
@@ -1987,7 +1959,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2024,9 +1996,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.87"
+version = "2.0.90"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
+checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2094,7 +2066,7 @@ version = "0.34.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9b8090d0eef9ad57b1b913b5e358e26145c86017e87338136509b94383a4af25"
 dependencies = [
- "derive_more",
+ "derive_more 0.99.18",
  "flex-error",
  "serde",
  "tendermint",
@@ -2121,9 +2093,9 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.36"
+version = "0.3.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
+checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21"
 dependencies = [
  "deranged",
  "num-conv",
@@ -2140,9 +2112,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
 
 [[package]]
 name = "time-macros"
-version = "0.2.18"
+version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
+checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de"
 dependencies = [
  "num-conv",
  "time-core",
@@ -2176,9 +2148,9 @@ dependencies = [
 
 [[package]]
 name = "tracing"
-version = "0.1.40"
+version = "0.1.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
 dependencies = [
  "pin-project-lite",
  "tracing-attributes",
@@ -2187,20 +2159,20 @@ dependencies = [
 
 [[package]]
 name = "tracing-attributes"
-version = "0.1.27"
+version = "0.1.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "tracing-core"
-version = "0.1.32"
+version = "0.1.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
 dependencies = [
  "once_cell",
 ]
@@ -2227,16 +2199,16 @@ dependencies = [
 name = "uint256-arith-program"
 version = "1.1.0"
 dependencies = [
- "crypto-bigint 0.6.0-rc.5",
+ "crypto-bigint 0.6.0-rc.6",
  "sp1-derive",
  "sp1-zkvm",
 ]
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.13"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
 [[package]]
 name = "verify-proof"
@@ -2391,7 +2363,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2411,5 +2383,5 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
diff --git a/crates/verifier/Cargo.toml b/crates/verifier/Cargo.toml
index c2b861b16d..df157e7043 100644
--- a/crates/verifier/Cargo.toml
+++ b/crates/verifier/Cargo.toml
@@ -12,9 +12,6 @@ categories = { workspace = true }
 [dependencies]
 bn = { version = "0.6.0", package = "substrate-bn-succinct" }
 sha2 = { version = "0.10.8", default-features = false }
-thiserror = { version = "2", default-features = false }
-hex = { version = "0.4.3", default-features = false, features = ["alloc"] }
-lazy_static = { version = "1.5.0", default-features = false }
 
 # arkworks
 ark-bn254 = { version = "0.4.0", optional = true }
@@ -23,6 +20,10 @@ ark-ff = { version = "0.4.2", optional = true }
 ark-groth16 = { version = "0.4.0", optional = true }
 ark-ec = { version = "0.4.0", optional = true }
 
+thiserror-no-std = "2.0.2"
+hex = { version = "0.4.3", default-features = false, features = ["alloc"] }
+lazy_static = { version = "1.5.0", default-features = false }
+
 [dev-dependencies]
 sp1-sdk = { workspace = true }
 num-bigint = "0.4.6"
@@ -30,5 +31,5 @@ num-traits = "0.2.19"
 
 [features]
 default = ["std"]
-std = ["thiserror/std"]
 ark = ["ark-bn254", "ark-serialize", "ark-ff", "ark-groth16", "ark-ec"]
+std = ["thiserror-no-std/std"]
diff --git a/crates/verifier/bn254-vk/groth16_vk.bin b/crates/verifier/bn254-vk/groth16_vk.bin
index 348cc8a0a3..00e64d83ea 100644
Binary files a/crates/verifier/bn254-vk/groth16_vk.bin and b/crates/verifier/bn254-vk/groth16_vk.bin differ
diff --git a/crates/verifier/bn254-vk/plonk_vk.bin b/crates/verifier/bn254-vk/plonk_vk.bin
index c8e98e6fdb..601f822ef0 100644
Binary files a/crates/verifier/bn254-vk/plonk_vk.bin and b/crates/verifier/bn254-vk/plonk_vk.bin differ
diff --git a/crates/verifier/src/error.rs b/crates/verifier/src/error.rs
index 1f30633fde..2d37bceac9 100644
--- a/crates/verifier/src/error.rs
+++ b/crates/verifier/src/error.rs
@@ -1,5 +1,5 @@
 use bn::{CurveError, FieldError, GroupError};
-use thiserror::Error;
+use thiserror_no_std::Error;
 
 #[derive(Error, Debug)]
 pub enum Error {
diff --git a/crates/verifier/src/groth16/ark_converter.rs b/crates/verifier/src/groth16/ark_converter.rs
index 2554585597..176e1945c0 100644
--- a/crates/verifier/src/groth16/ark_converter.rs
+++ b/crates/verifier/src/groth16/ark_converter.rs
@@ -3,7 +3,7 @@ use ark_ec::AffineRepr;
 use ark_ff::PrimeField;
 use ark_groth16::{Proof, VerifyingKey};
 use ark_serialize::{CanonicalDeserialize, Compress, Validate};
-use thiserror::Error;
+use thiserror_no_std::Error;
 
 const GNARK_MASK: u8 = 0b11 << 6;
 const GNARK_COMPRESSED_POSITIVE: u8 = 0b10 << 6;
diff --git a/crates/verifier/src/groth16/converter.rs b/crates/verifier/src/groth16/converter.rs
index 6648eb95c9..3b1898356d 100644
--- a/crates/verifier/src/groth16/converter.rs
+++ b/crates/verifier/src/groth16/converter.rs
@@ -24,8 +24,8 @@ pub(crate) fn load_groth16_proof_from_bytes(buffer: &[u8]) -> Result<Groth16Proo
 
 /// Load the Groth16 verification key from the given byte slice.
 ///
-/// The gnark verification key includes a lot of extraneous information. We only extract the necessary
-/// elements to verify a proof.
+/// The gnark verification key includes a lot of extraneous information. We only extract the
+/// necessary elements to verify a proof.
 pub(crate) fn load_groth16_verifying_key_from_bytes(
     buffer: &[u8],
 ) -> Result<Groth16VerifyingKey, Groth16Error> {
diff --git a/crates/verifier/src/groth16/error.rs b/crates/verifier/src/groth16/error.rs
index 18d8e2dcbe..36952cb749 100644
--- a/crates/verifier/src/groth16/error.rs
+++ b/crates/verifier/src/groth16/error.rs
@@ -1,4 +1,4 @@
-use thiserror::Error;
+use thiserror_no_std::Error;
 
 #[derive(Debug, Error)]
 pub enum Groth16Error {
diff --git a/crates/verifier/src/groth16/mod.rs b/crates/verifier/src/groth16/mod.rs
index 9075d6491b..3fd50d4465 100644
--- a/crates/verifier/src/groth16/mod.rs
+++ b/crates/verifier/src/groth16/mod.rs
@@ -26,8 +26,7 @@ impl Groth16Verifier {
     ///
     /// * `proof` - The proof bytes.
     /// * `public_inputs` - The SP1 public inputs.
-    /// * `sp1_vkey_hash` - The SP1 vkey hash.
-    ///   This is generated in the following manner:
+    /// * `sp1_vkey_hash` - The SP1 vkey hash. This is generated in the following manner:
     ///
     /// ```ignore
     /// use sp1_sdk::ProverClient;
@@ -35,9 +34,9 @@ impl Groth16Verifier {
     /// let (pk, vk) = client.setup(ELF);
     /// let sp1_vkey_hash = vk.bytes32();
     /// ```
-    /// * `groth16_vk` - The Groth16 verifying key bytes.
-    ///   Usually this will be the [`static@crate::GROTH16_VK_BYTES`] constant, which is the Groth16
-    ///   verifying key for the current SP1 version.
+    /// * `groth16_vk` - The Groth16 verifying key bytes. Usually this will be the
+    ///   [`static@crate::GROTH16_VK_BYTES`] constant, which is the Groth16 verifying key for the
+    ///   current SP1 version.
     ///
     /// # Returns
     ///
@@ -53,8 +52,8 @@ impl Groth16Verifier {
             .try_into()
             .map_err(|_| Groth16Error::GeneralError(Error::InvalidData))?;
 
-        // Check to make sure that this proof was generated by the groth16 proving key corresponding to
-        // the given groth16_vk.
+        // Check to make sure that this proof was generated by the groth16 proving key corresponding
+        // to the given groth16_vk.
         //
         // SP1 prepends the raw Groth16 proof with the first 4 bytes of the groth16 vkey to
         // facilitate this check.
diff --git a/crates/verifier/src/lib.rs b/crates/verifier/src/lib.rs
index 2220e60cb0..351975754a 100644
--- a/crates/verifier/src/lib.rs
+++ b/crates/verifier/src/lib.rs
@@ -23,8 +23,7 @@ mod error;
 mod utils;
 pub use utils::*;
 
-pub use groth16::error::Groth16Error;
-pub use groth16::Groth16Verifier;
+pub use groth16::{error::Groth16Error, Groth16Verifier};
 mod groth16;
 
 #[cfg(feature = "ark")]
diff --git a/crates/verifier/src/plonk/error.rs b/crates/verifier/src/plonk/error.rs
index e744cb844a..1d33e503d6 100644
--- a/crates/verifier/src/plonk/error.rs
+++ b/crates/verifier/src/plonk/error.rs
@@ -1,4 +1,4 @@
-use thiserror::Error;
+use thiserror_no_std::Error;
 
 #[derive(Error, Debug)]
 pub enum PlonkError {
diff --git a/crates/verifier/src/plonk/hash_to_field.rs b/crates/verifier/src/plonk/hash_to_field.rs
index fb077019b1..c1b18e7b4e 100644
--- a/crates/verifier/src/plonk/hash_to_field.rs
+++ b/crates/verifier/src/plonk/hash_to_field.rs
@@ -1,5 +1,4 @@
-use alloc::vec;
-use alloc::vec::Vec;
+use alloc::{vec, vec::Vec};
 use core::hash::Hasher;
 use sha2::Digest;
 
diff --git a/crates/verifier/src/plonk/mod.rs b/crates/verifier/src/plonk/mod.rs
index 613c2cab00..d2b02e3f72 100644
--- a/crates/verifier/src/plonk/mod.rs
+++ b/crates/verifier/src/plonk/mod.rs
@@ -34,8 +34,7 @@ impl PlonkVerifier {
     ///
     /// * `proof` - The proof bytes.
     /// * `public_inputs` - The SP1 public inputs.
-    /// * `sp1_vkey_hash` - The SP1 vkey hash.
-    ///   This is generated in the following manner:
+    /// * `sp1_vkey_hash` - The SP1 vkey hash. This is generated in the following manner:
     ///
     /// ```ignore
     /// use sp1_sdk::ProverClient;
@@ -43,8 +42,8 @@ impl PlonkVerifier {
     /// let (pk, vk) = client.setup(ELF);
     /// let sp1_vkey_hash = vk.bytes32();
     /// ```
-    /// * `plonk_vk` - The Plonk verifying key bytes.
-    ///   Usually this will be the [`static@crate::PLONK_VK_BYTES`] constant.
+    /// * `plonk_vk` - The Plonk verifying key bytes. Usually this will be the
+    ///   [`static@crate::PLONK_VK_BYTES`] constant.
     ///
     /// # Returns
     ///
@@ -60,8 +59,8 @@ impl PlonkVerifier {
             .try_into()
             .map_err(|_| PlonkError::GeneralError(Error::InvalidData))?;
 
-        // Check to make sure that this proof was generated by the plonk proving key corresponding to
-        // the given plonk vk.
+        // Check to make sure that this proof was generated by the plonk proving key corresponding
+        // to the given plonk vk.
         //
         // SP1 prepends the raw Plonk proof with the first 4 bytes of the plonk vkey to
         // facilitate this check.
diff --git a/crates/verifier/src/plonk/verify.rs b/crates/verifier/src/plonk/verify.rs
index 6da0872d6e..3ba28282a9 100644
--- a/crates/verifier/src/plonk/verify.rs
+++ b/crates/verifier/src/plonk/verify.rs
@@ -54,7 +54,8 @@ pub(crate) fn verify_plonk_algebraic(
         return Err(PlonkError::Bsb22CommitmentMismatch);
     }
 
-    // Check if the number of public inputs matches the number of public variables in the verifying key
+    // Check if the number of public inputs matches the number of public variables in the verifying
+    // key
     if public_inputs.len() != vk.nb_public_variables {
         return Err(PlonkError::InvalidWitness);
     }
@@ -266,8 +267,8 @@ pub(crate) fn verify_plonk_algebraic(
     scalars.push(zeta_n_plus_two_square_zh);
 
     // Compute the linearized polynomial digest:
-    // α²*L₁(ζ)*[Z] + _s1*[s3]+_s2*[Z] + l(ζ)*[Ql] + l(ζ)r(ζ)*[Qm] + r(ζ)*[Qr] + o(ζ)*[Qo] + [Qk] + ∑ᵢQcp_(ζ)[Pi_i] -
-    // Z_{H}(ζ)*(([H₀] + ζᵐ⁺²*[H₁] + ζ²⁽ᵐ⁺²⁾*[H₂])
+    // α²*L₁(ζ)*[Z] + _s1*[s3]+_s2*[Z] + l(ζ)*[Ql] + l(ζ)r(ζ)*[Qm] + r(ζ)*[Qr] + o(ζ)*[Qo] + [Qk] +
+    // ∑ᵢQcp_i(ζ)[Pi_i] - Z_{H}(ζ)*([H₀] + ζᵐ⁺²*[H₁] + ζ²⁽ᵐ⁺²⁾*[H₂])
     let linearized_polynomial_digest = AffineG1::msm(&points, &scalars);
 
     // Prepare digests for folding
diff --git a/crates/verifier/test_binaries/fibonacci-groth16.bin b/crates/verifier/test_binaries/fibonacci-groth16.bin
index 72c58644c3..9e67eed47a 100644
Binary files a/crates/verifier/test_binaries/fibonacci-groth16.bin and b/crates/verifier/test_binaries/fibonacci-groth16.bin differ
diff --git a/crates/verifier/test_binaries/fibonacci-plonk.bin b/crates/verifier/test_binaries/fibonacci-plonk.bin
index 303c9ff278..1d7569a806 100644
Binary files a/crates/verifier/test_binaries/fibonacci-plonk.bin and b/crates/verifier/test_binaries/fibonacci-plonk.bin differ
diff --git a/crates/zkvm/entrypoint/Cargo.toml b/crates/zkvm/entrypoint/Cargo.toml
index e38600efd9..320340a7ea 100644
--- a/crates/zkvm/entrypoint/Cargo.toml
+++ b/crates/zkvm/entrypoint/Cargo.toml
@@ -2,7 +2,7 @@
 name = "sp1-zkvm"
 description = "SP1 is a performant, 100% open-source, contributor-friendly zkVM."
 readme = "../../../README.md"
-version = "3.0.1"
+version = { workspace = true }
 edition = { workspace = true }
 license = { workspace = true }
 repository = { workspace = true }
diff --git a/crates/zkvm/lib/src/io.rs b/crates/zkvm/lib/src/io.rs
index 94a92a602c..6ab651fe05 100644
--- a/crates/zkvm/lib/src/io.rs
+++ b/crates/zkvm/lib/src/io.rs
@@ -12,9 +12,12 @@ pub const FD_PUBLIC_VALUES: u32 = 3;
 /// The file descriptor for hints.
 pub const FD_HINT: u32 = 4;
 
-/// The file descriptor for the `ecrecover` hook.
-pub const K1_ECRECOVER_HOOK: u32 = 5;
-pub const R1_ECRECOVER_HOOK: u32 = 6;
+/// The file descriptor through which to access `hook_k1_ecrecover`.
+pub const FD_K1_ECRECOVER_HOOK: u32 = 5;
+/// The file descriptor through which to access `hook_r1_ecrecover`.
+pub const FD_R1_ECRECOVER_HOOK: u32 = 6;
+/// The file descriptor through which to access `hook_ed_decompress`.
+pub const FD_EDDECOMPRESS: u32 = 8;
 
 /// A writer that writes to a file descriptor inside the zkVM.
 struct SyscallWriter {
diff --git a/crates/zkvm/lib/src/utils.rs b/crates/zkvm/lib/src/utils.rs
index 94ae422159..77853230b2 100644
--- a/crates/zkvm/lib/src/utils.rs
+++ b/crates/zkvm/lib/src/utils.rs
@@ -8,7 +8,8 @@ pub trait AffinePoint<const N: usize>: Clone + Sized {
     /// Returns a reference to the limbs.
     fn limbs_ref(&self) -> &[u32; N];
 
-    /// Returns a mutable reference to the limbs. If the point is the infinity point, this will panic.
+    /// Returns a mutable reference to the limbs. If the point is the infinity point, this will
+    /// panic.
     fn limbs_mut(&mut self) -> &mut [u32; N];
 
     /// Creates a new [`AffinePoint`] from the given x and y coordinates.
@@ -48,7 +49,8 @@ pub trait AffinePoint<const N: usize>: Clone + Sized {
     fn add_assign(&mut self, other: &Self);
 
     /// Adds the given [`AffinePoint`] to `self`. Can be optionally overridden to use a different
-    /// implementation of addition in multi-scalar multiplication, which is used in secp256k1 recovery.
+    /// implementation of addition in multi-scalar multiplication, which is used in secp256k1
+    /// recovery.
     fn complete_add_assign(&mut self, other: &Self) {
         self.add_assign(other);
     }
diff --git a/examples/Cargo.lock b/examples/Cargo.lock
index 53dd96cfd5..3cb9d9c3c8 100644
--- a/examples/Cargo.lock
+++ b/examples/Cargo.lock
@@ -68,17 +68,17 @@ dependencies = [
 
 [[package]]
 name = "allocator-api2"
-version = "0.2.18"
+version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
 
 [[package]]
 name = "alloy-chains"
-version = "0.1.47"
+version = "0.1.48"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18c5c520273946ecf715c0010b4e3503d7eba9893cd9ce6b7fff5654c4a3c470"
+checksum = "a0161082e0edd9013d23083465cc04b20e44b7a15646d36ba7b0cdb7cd6fe18f"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "num_enum 0.7.3",
  "serde",
@@ -92,7 +92,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "629b62e38d471cc15fea534eb7283d2f8a4e8bdb1811bcc5d66dda6cfce6fae1"
 dependencies = [
  "alloy-eips 0.3.6",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-serde 0.3.6",
  "c-kzg",
@@ -106,7 +106,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "41ed961a48297c732a5d97ee321aa8bb5009ecadbcb077d8bec90cb54e651629"
 dependencies = [
  "alloy-eips 0.5.4",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-serde 0.5.4",
  "auto_impl",
@@ -121,7 +121,7 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0069cf0642457f87a01a014f6dc29d5d893cd4fd8fddf0c3cdfad1bb3ebafc41"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "serde",
 ]
@@ -132,7 +132,7 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ea59dc42102bc9a1905dc57901edc6dd48b9f38115df86c7d252acba70d71d04"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "k256",
  "serde",
@@ -144,7 +144,7 @@ version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "64ffc577390ce50234e02d841214b3dc0bea6aaaae8e04bbf3cb82e9a45da9eb"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "derive_more 1.0.0",
  "serde",
@@ -158,7 +158,7 @@ checksum = "f923dd5fca5f67a43d81ed3ebad0880bd41f6dd0ada930030353ac356c54cd0f"
 dependencies = [
  "alloy-eip2930",
  "alloy-eip7702 0.1.1",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-serde 0.3.6",
  "c-kzg",
@@ -176,7 +176,7 @@ checksum = "b69e06cf9c37be824b9d26d6d101114fdde6af0c87de2828b414c05c4b3daa71"
 dependencies = [
  "alloy-eip2930",
  "alloy-eip7702 0.3.2",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-serde 0.5.4",
  "c-kzg",
@@ -192,18 +192,18 @@ version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3a7a18afb0b318616b6b2b0e2e7ac5529d32a966c673b48091c9919e284e6aca"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-serde 0.3.6",
  "serde",
 ]
 
 [[package]]
 name = "alloy-json-abi"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ded610181f3dad5810f6ff12d1a99994cf9b42d2fcb7709029352398a5da5ae6"
+checksum = "ac4b22b3e51cac09fd2adfcc73b55f447b4df669f983c13f7894ec82b607c63f"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-sol-type-parser",
  "serde",
  "serde_json",
@@ -215,11 +215,11 @@ version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af5979e0d5a7bf9c7eb79749121e8256e59021af611322aee56e77e20776b4b3"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-sol-types",
  "serde",
  "serde_json",
- "thiserror",
+ "thiserror 1.0.69",
  "tracing",
 ]
 
@@ -233,7 +233,7 @@ dependencies = [
  "alloy-eips 0.5.4",
  "alloy-json-rpc",
  "alloy-network-primitives 0.5.4",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rpc-types-eth 0.5.4",
  "alloy-serde 0.5.4",
  "alloy-signer",
@@ -241,7 +241,7 @@ dependencies = [
  "async-trait",
  "auto_impl",
  "futures-utils-wasm",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -251,7 +251,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94ad40869867ed2d9cd3842b1e800889e5b49e6b92da346e93862b4a741bedf3"
 dependencies = [
  "alloy-eips 0.3.6",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-serde 0.3.6",
  "serde",
 ]
@@ -264,7 +264,7 @@ checksum = "514f70ee2a953db21631cd817b13a1571474ec77ddc03d47616d5e8203489fde"
 dependencies = [
  "alloy-consensus 0.5.4",
  "alloy-eips 0.5.4",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-serde 0.5.4",
  "serde",
 ]
@@ -293,9 +293,9 @@ dependencies = [
 
 [[package]]
 name = "alloy-primitives"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd58d377699e6cfeab52c4a9d28bdc4ef37e2bd235ff2db525071fe37a2e9af5"
+checksum = "9db948902dfbae96a73c2fbf1f7abec62af034ab883e4c777c3fd29702bd6e2c"
 dependencies = [
  "alloy-rlp",
  "bytes",
@@ -304,9 +304,9 @@ dependencies = [
  "derive_more 1.0.0",
  "foldhash",
  "getrandom",
- "hashbrown 0.15.1",
+ "hashbrown 0.15.2",
  "hex-literal",
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
  "itoa",
  "k256",
  "keccak-asm",
@@ -314,7 +314,7 @@ dependencies = [
  "proptest",
  "rand 0.8.5",
  "ruint",
- "rustc-hash 2.0.0",
+ "rustc-hash 2.1.0",
  "serde",
  "sha3",
  "tiny-keccak",
@@ -339,7 +339,7 @@ checksum = "2b09cae092c27b6f1bde952653a22708691802e57bfef4a2973b80bea21efd3f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -362,7 +362,7 @@ dependencies = [
  "alloy-consensus 0.3.6",
  "alloy-eips 0.3.6",
  "alloy-network-primitives 0.3.6",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-serde 0.3.6",
  "alloy-sol-types",
@@ -383,7 +383,7 @@ dependencies = [
  "alloy-consensus 0.5.4",
  "alloy-eips 0.5.4",
  "alloy-network-primitives 0.5.4",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-serde 0.5.4",
  "alloy-sol-types",
@@ -399,7 +399,7 @@ version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "731f75ec5d383107fd745d781619bd9cedf145836c51ecb991623d41278e71fa"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "serde",
  "serde_json",
 ]
@@ -410,7 +410,7 @@ version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "028e72eaa9703e4882344983cfe7636ce06d8cce104a78ea62fd19b46659efc4"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "serde",
  "serde_json",
 ]
@@ -421,12 +421,12 @@ version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "592c185d7100258c041afac51877660c7bf6213447999787197db4842f0e938e"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "async-trait",
  "auto_impl",
  "elliptic-curve",
  "k256",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -437,66 +437,66 @@ checksum = "6614f02fc1d5b079b2a4a5320018317b506fd0a6d67c1fd5542a71201724986c"
 dependencies = [
  "alloy-consensus 0.5.4",
  "alloy-network",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-signer",
  "async-trait",
  "k256",
  "rand 0.8.5",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
 name = "alloy-sol-macro"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a1b42ac8f45e2f49f4bcdd72cbfde0bb148f5481d403774ffa546e48b83efc1"
+checksum = "3bfd7853b65a2b4f49629ec975fee274faf6dff15ab8894c620943398ef283c0"
 dependencies = [
  "alloy-sol-macro-expander",
  "alloy-sol-macro-input",
  "proc-macro-error2",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "alloy-sol-macro-expander"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06318f1778e57f36333e850aa71bd1bb5e560c10279e236622faae0470c50412"
+checksum = "82ec42f342d9a9261699f8078e57a7a4fda8aaa73c1a212ed3987080e6a9cd13"
 dependencies = [
  "alloy-sol-macro-input",
  "const-hex",
- "heck",
- "indexmap 2.6.0",
+ "heck 0.5.0",
+ "indexmap 2.7.0",
  "proc-macro-error2",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "syn-solidity",
  "tiny-keccak",
 ]
 
 [[package]]
 name = "alloy-sol-macro-input"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eaebb9b0ad61a41345a22c9279975c0cdd231b97947b10d7aad1cf0a7181e4a5"
+checksum = "ed2c50e6a62ee2b4f7ab3c6d0366e5770a21cad426e109c2f40335a1b3aff3df"
 dependencies = [
  "const-hex",
  "dunce",
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "syn-solidity",
 ]
 
 [[package]]
 name = "alloy-sol-type-parser"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12c71028bfbfec210e24106a542aad3def7caf1a70e2c05710e92a98481980d3"
+checksum = "ac17c6e89a50fb4a758012e4b409d9a0ba575228e69b539fe37d7a1bd507ca4a"
 dependencies = [
  "serde",
  "winnow 0.6.20",
@@ -504,12 +504,12 @@ dependencies = [
 
 [[package]]
 name = "alloy-sol-types"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "374d7fb042d68ddfe79ccb23359de3007f6d4d53c13f703b64fb0db422132111"
+checksum = "c9dc0fffe397aa17628160e16b89f704098bf3c9d74d5d369ebc239575936de5"
 dependencies = [
  "alloy-json-abi",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-sol-macro",
  "const-hex",
  "serde",
@@ -521,7 +521,7 @@ version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0a46c9c4fdccda7982e7928904bd85fe235a0404ee3d7e197fff13d61eac8b4f"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "derive_more 1.0.0",
  "hashbrown 0.14.5",
@@ -606,9 +606,9 @@ dependencies = [
 
 [[package]]
 name = "anyhow"
-version = "1.0.93"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775"
+checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
 
 [[package]]
 name = "ark-ff"
@@ -760,7 +760,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -781,7 +781,7 @@ checksum = "3c87f3f15e7794432337fc718554eaa4dc8f04c9677a950ffe366f20a162ae42"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -792,9 +792,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
 
 [[package]]
 name = "axum"
-version = "0.7.7"
+version = "0.7.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "504e3947307ac8326a5437504c517c4b56716c9d98fac0028c2acc7ca47d70ae"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
 dependencies = [
  "async-trait",
  "axum-core",
@@ -816,7 +816,7 @@ dependencies = [
  "serde_json",
  "serde_path_to_error",
  "serde_urlencoded",
- "sync_wrapper 1.0.1",
+ "sync_wrapper 1.0.2",
  "tokio",
  "tower",
  "tower-layer",
@@ -839,7 +839,7 @@ dependencies = [
  "mime",
  "pin-project-lite",
  "rustversion",
- "sync_wrapper 1.0.1",
+ "sync_wrapper 1.0.2",
  "tower-layer",
  "tower-service",
  "tracing",
@@ -911,7 +911,7 @@ dependencies = [
  "regex",
  "rustc-hash 1.1.0",
  "shlex",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1031,7 +1031,7 @@ dependencies = [
  "group 0.13.0",
  "pairing 0.23.0",
  "rand_core 0.6.4",
- "sp1-lib 3.1.0",
+ "sp1-lib 3.4.0",
  "subtle",
 ]
 
@@ -1078,9 +1078,9 @@ checksum = "c3ac9f8b63eca6fd385229b3675f6cc0dc5c8a5c8a54a59d4f52ffd670d87b0c"
 
 [[package]]
 name = "bytemuck"
-version = "1.19.0"
+version = "1.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d"
+checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a"
 dependencies = [
  "bytemuck_derive",
 ]
@@ -1093,7 +1093,7 @@ checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1104,9 +1104,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.8.0"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da"
+checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
 dependencies = [
  "serde",
 ]
@@ -1137,9 +1137,9 @@ dependencies = [
 
 [[package]]
 name = "cargo-platform"
-version = "0.1.8"
+version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc"
+checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea"
 dependencies = [
  "serde",
 ]
@@ -1155,14 +1155,33 @@ dependencies = [
  "semver 1.0.23",
  "serde",
  "serde_json",
- "thiserror",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "cbindgen"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fce8dd7fcfcbf3a0a87d8f515194b49d6135acab73e18bd380d1d93bb1a15eb"
+dependencies = [
+ "clap",
+ "heck 0.4.1",
+ "indexmap 2.7.0",
+ "log",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn 2.0.90",
+ "tempfile",
+ "toml",
 ]
 
 [[package]]
 name = "cc"
-version = "1.1.36"
+version = "1.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baee610e9452a8f6f0a1b6194ec09ff9e2d85dea54432acdae41aa0761c95d70"
+checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc"
 dependencies = [
  "jobserver",
  "libc",
@@ -1244,9 +1263,9 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.5.20"
+version = "4.5.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
+checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -1254,9 +1273,9 @@ dependencies = [
 
 [[package]]
 name = "clap_builder"
-version = "4.5.20"
+version = "4.5.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
+checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1"
 dependencies = [
  "anstream",
  "anstyle",
@@ -1270,17 +1289,17 @@ version = "4.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "clap_lex"
-version = "0.7.2"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
+checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"
 
 [[package]]
 name = "colorchoice"
@@ -1297,15 +1316,15 @@ dependencies = [
  "encode_unicode",
  "lazy_static",
  "libc",
- "unicode-width",
+ "unicode-width 0.1.14",
  "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "const-hex"
-version = "1.13.1"
+version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0121754e84117e65f9d90648ee6aa4882a6e63110307ab73967a4c5e7e69e586"
+checksum = "4b0485bab839b018a8f1723fc5391819fea5f8f0f32288ef8a735fd096b6160c"
 dependencies = [
  "cfg-if",
  "cpufeatures",
@@ -1355,9 +1374,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.14"
+version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0"
+checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3"
 dependencies = [
  "libc",
 ]
@@ -1377,6 +1396,15 @@ version = "2.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
 
+[[package]]
+name = "crossbeam-channel"
+version = "0.5.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "crossbeam-deque"
 version = "0.8.5"
@@ -1453,7 +1481,7 @@ dependencies = [
 [[package]]
 name = "curve25519-dalek"
 version = "4.1.3"
-source = "git+https://github.com/sp1-patches/curve25519-dalek?tag=curve25519_dalek-v4.1.3-patch-v1#dbdd0ffeea0ff767affc3f6765d1edbdaa9e2cb9"
+source = "git+https://github.com/sp1-patches/curve25519-dalek?tag=patch-v4.1.3-v3.4.0#bfe63b8205f0b6baa0c1f4c71da33209f766e3e4"
 dependencies = [
  "anyhow",
  "cfg-if",
@@ -1462,7 +1490,7 @@ dependencies = [
  "digest 0.10.7",
  "fiat-crypto",
  "rustc_version 0.4.1",
- "sp1-lib 3.1.0",
+ "sp1-lib 3.4.0",
  "subtle",
  "zeroize",
 ]
@@ -1470,11 +1498,11 @@ dependencies = [
 [[package]]
 name = "curve25519-dalek-derive"
 version = "0.1.1"
-source = "git+https://github.com/sp1-patches/curve25519-dalek?tag=curve25519_dalek-v4.1.3-patch-v1#dbdd0ffeea0ff767affc3f6765d1edbdaa9e2cb9"
+source = "git+https://github.com/sp1-patches/curve25519-dalek?tag=patch-v4.1.3-v3.4.0#bfe63b8205f0b6baa0c1f4c71da33209f766e3e4"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1487,7 +1515,7 @@ dependencies = [
  "cfg-if",
  "digest 0.9.0",
  "rand_core 0.6.4",
- "sp1-lib 3.1.0",
+ "sp1-lib 3.4.0",
  "subtle-ng",
  "zeroize",
 ]
@@ -1502,7 +1530,7 @@ dependencies = [
  "cfg-if",
  "digest 0.9.0",
  "rand_core 0.6.4",
- "sp1-lib 3.1.0",
+ "sp1-lib 3.4.0",
  "subtle-ng",
  "zeroize",
 ]
@@ -1544,7 +1572,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1555,7 +1583,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
 dependencies = [
  "darling_core",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1689,7 +1717,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustc_version 0.4.1",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1710,7 +1738,7 @@ dependencies = [
  "convert_case 0.6.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "unicode-xid",
 ]
 
@@ -1764,7 +1792,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1788,29 +1816,17 @@ checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125"
 [[package]]
 name = "ecdsa"
 version = "0.16.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca"
-dependencies = [
- "der 0.7.9",
- "digest 0.10.7",
- "elliptic-curve",
- "rfc6979",
- "signature",
- "spki 0.7.3",
-]
-
-[[package]]
-name = "ecdsa"
-version = "0.16.9"
-source = "git+https://github.com/sp1-patches/signatures?branch=patch-ecdsa-v0.16.9#475daa8834035cc170a567e7656329ab8de8cc44"
+source = "git+https://github.com/sp1-patches/signatures?branch=ecdsa-v0.16.9-patch-v4.0.0-rc.2#d7e485412a2ac4974c63314ac0bce98473630859"
 dependencies = [
  "anyhow",
  "cfg-if",
+ "der 0.7.9",
  "digest 0.10.7",
  "elliptic-curve",
  "hex-literal",
+ "rfc6979",
  "signature",
- "sp1-lib 3.1.0",
+ "sp1-lib 4.0.0-rc.1",
  "spki 0.7.3",
 ]
 
@@ -1834,7 +1850,7 @@ dependencies = [
  "rand_core 0.6.4",
  "serde",
  "sha2 0.9.9",
- "thiserror",
+ "thiserror 1.0.69",
  "zeroize",
 ]
 
@@ -1924,7 +1940,7 @@ checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1935,7 +1951,7 @@ checksum = "2f9ed6b3789237c8a0c1c505af1c7eb2c560df6186f01b098c3a1064ea532f38"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1946,12 +1962,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 
 [[package]]
 name = "errno"
-version = "0.3.9"
+version = "0.3.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
+checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
 dependencies = [
  "libc",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -1988,9 +2004,9 @@ dependencies = [
 
 [[package]]
 name = "fastrand"
-version = "2.1.1"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6"
+checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4"
 
 [[package]]
 name = "fastrlp"
@@ -2169,7 +2185,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2227,9 +2243,9 @@ dependencies = [
 
 [[package]]
 name = "generic-array"
-version = "1.1.0"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96512db27971c2c3eece70a1e106fbe6c87760234e31e8f7e5634912fe52794a"
+checksum = "2cb8bc4c28d15ade99c7e90b219f30da4be5c88e586277e8cbe886beeb868ab2"
 dependencies = [
  "serde",
  "typenum",
@@ -2242,8 +2258,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
 dependencies = [
  "cfg-if",
+ "js-sys",
  "libc",
  "wasi",
+ "wasm-bindgen",
 ]
 
 [[package]]
@@ -2358,9 +2376,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.15.1"
+version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
 dependencies = [
  "allocator-api2",
  "equivalent",
@@ -2368,6 +2386,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -2406,9 +2430,9 @@ dependencies = [
 
 [[package]]
 name = "http"
-version = "1.1.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
+checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea"
 dependencies = [
  "bytes",
  "fnv",
@@ -2452,9 +2476,9 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
 
 [[package]]
 name = "hyper"
-version = "1.5.0"
+version = "1.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a"
+checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f"
 dependencies = [
  "bytes",
  "futures-channel",
@@ -2645,7 +2669,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2686,13 +2710,13 @@ dependencies = [
 
 [[package]]
 name = "impl-trait-for-tuples"
-version = "0.2.2"
+version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11d7a9f6330b71fea57921c9b61c47ee6e84f72d394754eff6163ae67e7395eb"
+checksum = "a0eb5a3343abf848c0984fe4604b2b105da9539376e24fc0a3b0007411ae4fd9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2714,35 +2738,26 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.6.0"
+version = "2.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
+checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
 dependencies = [
  "equivalent",
- "hashbrown 0.15.1",
+ "hashbrown 0.15.2",
  "serde",
 ]
 
 [[package]]
 name = "indicatif"
-version = "0.17.8"
+version = "0.17.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
+checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
 dependencies = [
  "console",
- "instant",
  "number_prefix",
  "portable-atomic",
- "unicode-width",
-]
-
-[[package]]
-name = "instant"
-version = "0.1.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
-dependencies = [
- "cfg-if",
+ "unicode-width 0.2.0",
+ "web-time",
 ]
 
 [[package]]
@@ -2818,9 +2833,9 @@ dependencies = [
 
 [[package]]
 name = "itoa"
-version = "1.0.11"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
+checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
 
 [[package]]
 name = "jobserver"
@@ -2833,10 +2848,11 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.72"
+version = "0.3.74"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9"
+checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705"
 dependencies = [
+ "once_cell",
  "wasm-bindgen",
 ]
 
@@ -2887,7 +2903,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b"
 dependencies = [
  "cfg-if",
- "ecdsa 0.16.9 (registry+https://github.com/rust-lang/crates.io-index)",
+ "ecdsa",
  "elliptic-curve",
  "once_cell",
  "sha2 0.10.8",
@@ -2937,9 +2953,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.161"
+version = "0.2.167"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1"
+checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
 
 [[package]]
 name = "libgit2-sys"
@@ -2955,9 +2971,9 @@ dependencies = [
 
 [[package]]
 name = "libloading"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
+checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
 dependencies = [
  "cfg-if",
  "windows-targets 0.52.6",
@@ -2999,9 +3015,9 @@ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
 
 [[package]]
 name = "litemap"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
+checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
 
 [[package]]
 name = "lock_api"
@@ -3025,7 +3041,7 @@ version = "0.12.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
 dependencies = [
- "hashbrown 0.15.1",
+ "hashbrown 0.15.2",
 ]
 
 [[package]]
@@ -3078,11 +3094,10 @@ dependencies = [
 
 [[package]]
 name = "mio"
-version = "1.0.2"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec"
+checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
 dependencies = [
- "hermit-abi",
  "libc",
  "wasi",
  "windows-sys 0.52.0",
@@ -3237,7 +3252,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -3344,7 +3359,7 @@ checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -3398,7 +3413,7 @@ checksum = "21aad1fbf80d2bcd7406880efc7ba109365f44bbb72896758ddcbfa46bf1592c"
 dependencies = [
  "alloy-consensus 0.3.6",
  "alloy-eips 0.3.6",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-serde 0.3.6",
  "derive_more 1.0.0",
@@ -3414,7 +3429,7 @@ checksum = "e281fbfc2198b7c0c16457d6524f83d192662bc9f3df70f24c3038d4521616df"
 dependencies = [
  "alloy-eips 0.3.6",
  "alloy-network-primitives 0.3.6",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rpc-types-eth 0.3.6",
  "alloy-serde 0.3.6",
  "cfg-if",
@@ -3448,7 +3463,7 @@ version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b"
 dependencies = [
- "ecdsa 0.16.9 (registry+https://github.com/rust-lang/crates.io-index)",
+ "ecdsa",
  "elliptic-curve",
  "primeorder",
  "sha2 0.10.8",
@@ -3457,7 +3472,7 @@ dependencies = [
 [[package]]
 name = "p3-air"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -3466,7 +3481,7 @@ dependencies = [
 [[package]]
 name = "p3-baby-bear"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "num-bigint 0.4.6",
  "p3-field",
@@ -3480,7 +3495,7 @@ dependencies = [
 [[package]]
 name = "p3-bn254-fr"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "ff 0.13.0",
  "num-bigint 0.4.6",
@@ -3494,7 +3509,7 @@ dependencies = [
 [[package]]
 name = "p3-challenger"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-maybe-rayon",
@@ -3507,7 +3522,7 @@ dependencies = [
 [[package]]
 name = "p3-commit"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-challenger",
@@ -3520,7 +3535,7 @@ dependencies = [
 [[package]]
 name = "p3-dft"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -3532,7 +3547,7 @@ dependencies = [
 [[package]]
 name = "p3-field"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "num-bigint 0.4.6",
@@ -3545,7 +3560,7 @@ dependencies = [
 [[package]]
 name = "p3-fri"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-challenger",
@@ -3563,7 +3578,7 @@ dependencies = [
 [[package]]
 name = "p3-interpolation"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-field",
  "p3-matrix",
@@ -3573,7 +3588,7 @@ dependencies = [
 [[package]]
 name = "p3-keccak-air"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "p3-air",
  "p3-field",
@@ -3586,7 +3601,7 @@ dependencies = [
 [[package]]
 name = "p3-matrix"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-field",
@@ -3600,7 +3615,7 @@ dependencies = [
 [[package]]
 name = "p3-maybe-rayon"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "rayon",
 ]
@@ -3608,7 +3623,7 @@ dependencies = [
 [[package]]
 name = "p3-mds"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-dft",
@@ -3622,7 +3637,7 @@ dependencies = [
 [[package]]
 name = "p3-merkle-tree"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-commit",
@@ -3638,7 +3653,7 @@ dependencies = [
 [[package]]
 name = "p3-poseidon2"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "gcd",
  "p3-field",
@@ -3651,7 +3666,7 @@ dependencies = [
 [[package]]
 name = "p3-symmetric"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-field",
@@ -3661,7 +3676,7 @@ dependencies = [
 [[package]]
 name = "p3-uni-stark"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "itertools 0.12.1",
  "p3-air",
@@ -3679,7 +3694,7 @@ dependencies = [
 [[package]]
 name = "p3-util"
 version = "0.1.0"
-source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#db3d45d4ec899efaf8f7234a8573f285fbdda5db"
+source = "git+https://github.com/Plonky3/Plonky3?branch=sp1-v4#bba88386261c3eaceb7f922b99bea56c1d6c6c58"
 dependencies = [
  "serde",
 ]
@@ -3791,9 +3806,10 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
 name = "patch-testing-program"
 version = "1.1.0"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "curve25519-dalek",
  "curve25519-dalek-ng 4.1.1 (git+https://github.com/sp1-patches/curve25519-dalek-ng?tag=curve25519_dalek_ng-v4.1.1-patch-v1)",
+ "ecdsa",
  "ed25519-consensus",
  "ed25519-dalek",
  "k256",
@@ -3815,6 +3831,12 @@ dependencies = [
  "sp1-sdk",
 ]
 
+[[package]]
+name = "pathdiff"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
+
 [[package]]
 name = "pem-rfc7468"
 version = "0.3.1"
@@ -3846,7 +3868,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442"
 dependencies = [
  "memchr",
- "thiserror",
+ "thiserror 1.0.69",
  "ucd-trie",
 ]
 
@@ -3913,9 +3935,9 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
 
 [[package]]
 name = "portable-atomic"
-version = "1.9.0"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2"
+checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
 
 [[package]]
 name = "powerfmt"
@@ -3939,7 +3961,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033"
 dependencies = [
  "proc-macro2",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4000,14 +4022,14 @@ dependencies = [
  "proc-macro-error-attr2",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.89"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
 dependencies = [
  "unicode-ident",
 ]
@@ -4062,7 +4084,7 @@ dependencies = [
  "itertools 0.12.1",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4075,7 +4097,7 @@ dependencies = [
  "itertools 0.13.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4095,37 +4117,40 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
 
 [[package]]
 name = "quinn"
-version = "0.11.5"
+version = "0.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684"
+checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef"
 dependencies = [
  "bytes",
  "pin-project-lite",
  "quinn-proto",
  "quinn-udp",
- "rustc-hash 2.0.0",
+ "rustc-hash 2.1.0",
  "rustls",
  "socket2",
- "thiserror",
+ "thiserror 2.0.4",
  "tokio",
  "tracing",
 ]
 
 [[package]]
 name = "quinn-proto"
-version = "0.11.8"
+version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6"
+checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d"
 dependencies = [
  "bytes",
+ "getrandom",
  "rand 0.8.5",
  "ring",
- "rustc-hash 2.0.0",
+ "rustc-hash 2.1.0",
  "rustls",
+ "rustls-pki-types",
  "slab",
- "thiserror",
+ "thiserror 2.0.4",
  "tinyvec",
  "tracing",
+ "web-time",
 ]
 
 [[package]]
@@ -4289,7 +4314,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
 dependencies = [
  "getrandom",
  "libredox",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -4300,7 +4325,7 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-automata 0.4.8",
+ "regex-automata 0.4.9",
  "regex-syntax 0.8.5",
 ]
 
@@ -4315,9 +4340,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.4.8"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -4382,7 +4407,7 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_urlencoded",
- "sync_wrapper 1.0.1",
+ "sync_wrapper 1.0.2",
  "tokio",
  "tokio-rustls",
  "tokio-util",
@@ -4407,7 +4432,7 @@ dependencies = [
  "http",
  "reqwest",
  "serde",
- "thiserror",
+ "thiserror 1.0.69",
  "tower-service",
 ]
 
@@ -4420,7 +4445,7 @@ dependencies = [
  "reth-execution-errors",
  "reth-primitives",
  "reth-storage-errors",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -4431,7 +4456,7 @@ dependencies = [
  "alloy-chains",
  "alloy-eips 0.3.6",
  "alloy-genesis",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-trie",
  "auto_impl",
  "derive_more 1.0.0",
@@ -4453,7 +4478,7 @@ dependencies = [
  "alloy-consensus 0.3.6",
  "alloy-eips 0.3.6",
  "alloy-genesis",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-trie",
  "bytes",
  "modular-bitfield",
@@ -4469,7 +4494,7 @@ dependencies = [
  "convert_case 0.6.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4514,7 +4539,7 @@ dependencies = [
  "reth-execution-errors",
  "reth-fs-util",
  "reth-storage-errors",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -4535,13 +4560,13 @@ version = "1.0.6"
 source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374182a43a3602aaa953d37aa9217b"
 dependencies = [
  "alloy-chains",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "auto_impl",
  "crc",
  "dyn-clone",
  "once_cell",
- "rustc-hash 2.0.0",
+ "rustc-hash 2.1.0",
  "serde",
  "thiserror-no-std",
 ]
@@ -4598,7 +4623,7 @@ dependencies = [
  "reth-revm",
  "revm",
  "revm-primitives",
- "thiserror",
+ "thiserror 1.0.69",
  "tracing",
 ]
 
@@ -4608,7 +4633,7 @@ version = "1.0.6"
 source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374182a43a3602aaa953d37aa9217b"
 dependencies = [
  "alloy-eips 0.3.6",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "derive_more 1.0.0",
  "nybbles",
@@ -4637,7 +4662,7 @@ source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374
 dependencies = [
  "serde",
  "serde_json",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -4645,11 +4670,11 @@ name = "reth-network-peers"
 version = "1.0.6"
 source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374182a43a3602aaa953d37aa9217b"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "enr",
  "serde_with",
- "thiserror",
+ "thiserror 1.0.69",
  "url",
 ]
 
@@ -4659,7 +4684,7 @@ version = "1.0.6"
 source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374182a43a3602aaa953d37aa9217b"
 dependencies = [
  "alloy-chains",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "derive_more 1.0.0",
  "once_cell",
  "reth-chainspec",
@@ -4688,7 +4713,7 @@ dependencies = [
  "alloy-consensus 0.3.6",
  "alloy-eips 0.3.6",
  "alloy-genesis",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-rpc-types",
  "alloy-serde 0.3.6",
@@ -4706,7 +4731,7 @@ dependencies = [
  "reth-trie-common",
  "revm-primitives",
  "serde",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -4717,7 +4742,7 @@ dependencies = [
  "alloy-consensus 0.3.6",
  "alloy-eips 0.3.6",
  "alloy-genesis",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-rpc-types-eth 0.3.6",
  "byteorder",
@@ -4735,13 +4760,13 @@ name = "reth-prune-types"
 version = "1.0.6"
 source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374182a43a3602aaa953d37aa9217b"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "bytes",
  "derive_more 1.0.0",
  "modular-bitfield",
  "reth-codecs",
  "serde",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -4764,7 +4789,7 @@ name = "reth-stages-types"
 version = "1.0.6"
 source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374182a43a3602aaa953d37aa9217b"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "bytes",
  "modular-bitfield",
  "reth-codecs",
@@ -4777,7 +4802,7 @@ name = "reth-static-file-types"
 version = "1.0.6"
 source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374182a43a3602aaa953d37aa9217b"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "derive_more 1.0.0",
  "serde",
  "strum",
@@ -4836,7 +4861,7 @@ source = "git+https://github.com/sp1-patches/reth?tag=rsp-20240830#260c7ed2c9374
 dependencies = [
  "alloy-consensus 0.3.6",
  "alloy-genesis",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-trie",
  "bytes",
@@ -4901,7 +4926,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e7a6bff9dbde3370a5ac9555104117f7e6039b3cc76e8d5d9d01899088beca2a"
 dependencies = [
  "alloy-eips 0.3.6",
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "auto_impl",
  "bitflags",
  "bitvec",
@@ -4918,8 +4943,7 @@ dependencies = [
 [[package]]
 name = "rfc6979"
 version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2"
+source = "git+https://github.com/sp1-patches/signatures?branch=ecdsa-v0.16.9-patch-v4.0.0-rc.2#d7e485412a2ac4974c63314ac0bce98473630859"
 dependencies = [
  "hmac",
  "subtle",
@@ -4961,9 +4985,9 @@ dependencies = [
 
 [[package]]
 name = "roaring"
-version = "0.10.6"
+version = "0.10.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1"
+checksum = "f81dc953b2244ddd5e7860cb0bb2a790494b898ef321d4aff8e260efab60cc88"
 dependencies = [
  "bytemuck",
  "byteorder",
@@ -5002,9 +5026,9 @@ dependencies = [
 
 [[package]]
 name = "rsa"
-version = "0.9.6"
+version = "0.9.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc"
+checksum = "47c75d7c5c6b673e58bf54d8544a9f432e3a925b0e80f7cd3602ab5c50c55519"
 dependencies = [
  "const-oid 0.9.6",
  "digest 0.10.7",
@@ -5024,7 +5048,7 @@ dependencies = [
 name = "rsa-program"
 version = "1.1.0"
 dependencies = [
- "rsa 0.9.6",
+ "rsa 0.9.7",
  "sha2 0.10.8",
  "sp1-zkvm",
 ]
@@ -5043,7 +5067,7 @@ name = "rsp-client-executor"
 version = "0.1.0"
 source = "git+https://github.com/succinctlabs/rsp/?rev=3647076#3647076da6580e30384dd911a3fc50d4bcdb5bc1"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "eyre",
  "futures",
@@ -5076,7 +5100,7 @@ name = "rsp-mpt"
 version = "0.1.0"
 source = "git+https://github.com/succinctlabs/rsp/?rev=3647076#3647076da6580e30384dd911a3fc50d4bcdb5bc1"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "alloy-rlp",
  "alloy-rpc-types",
  "anyhow",
@@ -5090,7 +5114,7 @@ dependencies = [
  "rlp",
  "rsp-primitives",
  "serde",
- "thiserror",
+ "thiserror 1.0.69",
 ]
 
 [[package]]
@@ -5126,7 +5150,7 @@ dependencies = [
 name = "rsp-script"
 version = "0.1.0"
 dependencies = [
- "alloy-primitives 0.8.11",
+ "alloy-primitives 0.8.14",
  "bincode",
  "clap",
  "rsp-client-executor",
@@ -5190,9 +5214,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
 
 [[package]]
 name = "rustc-hash"
-version = "2.0.0"
+version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152"
+checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497"
 dependencies = [
  "rand 0.8.5",
 ]
@@ -5223,9 +5247,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.39"
+version = "0.38.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "375116bee2be9ed569afe2154ea6a99dfdffd257f533f187498c2a8f5feaf4ee"
+checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6"
 dependencies = [
  "bitflags",
  "errno",
@@ -5236,9 +5260,9 @@ dependencies = [
 
 [[package]]
 name = "rustls"
-version = "0.23.16"
+version = "0.23.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e"
+checksum = "934b404430bb06b3fae2cba809eb45a1ab1aecd64491213d7c3301b88393f8d1"
 dependencies = [
  "once_cell",
  "ring",
@@ -5262,6 +5286,9 @@ name = "rustls-pki-types"
 version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
+dependencies = [
+ "web-time",
+]
 
 [[package]]
 name = "rustls-webpki"
@@ -5300,9 +5327,9 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 
 [[package]]
 name = "scale-info"
-version = "2.11.5"
+version = "2.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1aa7ffc1c0ef49b0452c6e2986abf2b07743320641ffd5fc63d552458e3b779b"
+checksum = "346a3b32eba2640d17a9cb5927056b08f3de90f65b72fe09402c2ad07d684d0b"
 dependencies = [
  "cfg-if",
  "derive_more 1.0.0",
@@ -5312,21 +5339,21 @@ dependencies = [
 
 [[package]]
 name = "scale-info-derive"
-version = "2.11.5"
+version = "2.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46385cc24172cf615450267463f937c10072516359b3ff1cb24228a4a08bf951"
+checksum = "c6630024bf739e2179b91fb424b28898baf819414262c5d376677dbff1fe7ebf"
 dependencies = [
  "proc-macro-crate 3.2.0",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "scc"
-version = "2.2.4"
+version = "2.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d8d25269dd3a12467afe2e510f69fb0b46b698e5afb296b59f2145259deaf8e8"
+checksum = "66b202022bb57c049555430e11fc22fea12909276a80a4c3d368da36ac1d88ed"
 dependencies = [
  "sdd",
 ]
@@ -5359,11 +5386,11 @@ dependencies = [
 
 [[package]]
 name = "secp256k1"
-version = "0.29.0"
-source = "git+https://github.com/sp1-patches/rust-secp256k1?tag=secp256k1-v0.29.0-patch-v1#c78195abe3c5bc11163d69588a5559ef21bdff31"
+version = "0.29.1"
+source = "git+https://github.com/sp1-patches/rust-secp256k1?tag=patch-v0.29.1-v4.0.0#f947f69d8f30af6d548c2b64bff7376f951c417a"
 dependencies = [
  "cfg-if",
- "ecdsa 0.16.9 (git+https://github.com/sp1-patches/signatures?branch=patch-ecdsa-v0.16.9)",
+ "ecdsa",
  "elliptic-curve",
  "k256",
  "rand 0.8.5",
@@ -5373,7 +5400,7 @@ dependencies = [
 [[package]]
 name = "secp256k1-sys"
 version = "0.10.0"
-source = "git+https://github.com/sp1-patches/rust-secp256k1?tag=secp256k1-v0.29.0-patch-v1#c78195abe3c5bc11163d69588a5559ef21bdff31"
+source = "git+https://github.com/sp1-patches/rust-secp256k1?tag=patch-v0.29.1-v4.0.0#f947f69d8f30af6d548c2b64bff7376f951c417a"
 dependencies = [
  "cc",
 ]
@@ -5398,18 +5425,18 @@ dependencies = [
 
 [[package]]
 name = "semver-parser"
-version = "0.10.2"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
+checksum = "9900206b54a3527fdc7b8a938bffd94a568bac4f4aa8113b209df75a09c0dec2"
 dependencies = [
  "pest",
 ]
 
 [[package]]
 name = "serde"
-version = "1.0.214"
+version = "1.0.215"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5"
+checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f"
 dependencies = [
  "serde_derive",
 ]
@@ -5435,22 +5462,22 @@ dependencies = [
 
 [[package]]
 name = "serde_derive"
-version = "1.0.214"
+version = "1.0.215"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766"
+checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.132"
+version = "1.0.133"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03"
+checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
  "itoa",
  "memchr",
  "ryu",
@@ -5475,7 +5502,16 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
+]
+
+[[package]]
+name = "serde_spanned"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1"
+dependencies = [
+ "serde",
 ]
 
 [[package]]
@@ -5500,7 +5536,7 @@ dependencies = [
  "chrono",
  "hex",
  "indexmap 1.9.3",
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
  "serde",
  "serde_derive",
  "serde_json",
@@ -5517,14 +5553,14 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "serial_test"
-version = "3.1.1"
+version = "3.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b4b487fe2acf240a021cf57c6b2b4903b1e78ca0ecd862a71b71d2a51fed77d"
+checksum = "1b258109f244e1d6891bf1053a55d63a5cd4f8f4c30cf9a1280989f80e7a1fa9"
 dependencies = [
  "futures",
  "log",
@@ -5536,13 +5572,13 @@ dependencies = [
 
 [[package]]
 name = "serial_test_derive"
-version = "3.1.1"
+version = "3.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82fe9db325bcef1fbcde82e078a5cc4efdf787e96b3b9cf45b50b529f2083d67"
+checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -5669,9 +5705,9 @@ dependencies = [
 
 [[package]]
 name = "socket2"
-version = "0.5.7"
+version = "0.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c"
+checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8"
 dependencies = [
  "libc",
  "windows-sys 0.52.0",
@@ -5679,7 +5715,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-build"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "anyhow",
  "cargo_metadata",
@@ -5690,7 +5726,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-core-executor"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "bytemuck",
@@ -5713,7 +5749,7 @@ dependencies = [
  "sp1-stark",
  "strum",
  "strum_macros",
- "thiserror",
+ "thiserror 1.0.69",
  "tiny-keccak",
  "tracing",
  "typenum",
@@ -5722,12 +5758,15 @@ dependencies = [
 
 [[package]]
 name = "sp1-core-machine"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
+ "cbindgen",
+ "cc",
  "cfg-if",
  "elliptic-curve",
- "generic-array 1.1.0",
+ "generic-array 1.1.1",
+ "glob",
  "hashbrown 0.14.5",
  "hex",
  "itertools 0.13.0",
@@ -5745,7 +5784,10 @@ dependencies = [
  "p3-maybe-rayon",
  "p3-uni-stark",
  "p3-util",
+ "pathdiff",
  "rand 0.8.5",
+ "rayon",
+ "rayon-scan",
  "serde",
  "size",
  "snowbridge-amcl",
@@ -5758,7 +5800,7 @@ dependencies = [
  "strum",
  "strum_macros",
  "tempfile",
- "thiserror",
+ "thiserror 1.0.69",
  "tracing",
  "tracing-forest",
  "tracing-subscriber",
@@ -5768,7 +5810,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-cuda"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "ctrlc",
@@ -5783,13 +5825,13 @@ dependencies = [
 
 [[package]]
 name = "sp1-curves"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "cfg-if",
- "curve25519-dalek",
  "dashu",
+ "ecdsa",
  "elliptic-curve",
- "generic-array 1.1.0",
+ "generic-array 1.1.1",
  "itertools 0.13.0",
  "k256",
  "num",
@@ -5804,7 +5846,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-derive"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "quote",
  "syn 1.0.109",
@@ -5826,7 +5868,9 @@ dependencies = [
 
 [[package]]
 name = "sp1-lib"
-version = "3.0.0"
+version = "3.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a5729da1b05d56c01457e5ecabdc77f1cc941df23f2921163a2f325aec22428"
 dependencies = [
  "bincode",
  "serde",
@@ -5834,9 +5878,16 @@ dependencies = [
 
 [[package]]
 name = "sp1-lib"
-version = "3.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14deb700469a37ec075bcf88dac3815b026dd9c4b9cb175980826f1fbb2e4e80"
+version = "4.0.0-rc.1"
+source = "git+https://github.com/succinctlabs/sp1.git?branch=v4.0.0-rc.2#e1c2fb381e96cb1eb9d21eee28fd5d888dd225f3"
+dependencies = [
+ "bincode",
+ "serde",
+]
+
+[[package]]
+name = "sp1-lib"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "serde",
@@ -5844,7 +5895,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-primitives"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "bincode",
  "hex",
@@ -5860,7 +5911,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-prover"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "anyhow",
  "bincode",
@@ -5888,14 +5939,15 @@ dependencies = [
  "sp1-recursion-core",
  "sp1-recursion-gnark-ffi",
  "sp1-stark",
- "thiserror",
+ "thiserror 1.0.69",
  "tracing",
+ "tracing-appender",
  "tracing-subscriber",
 ]
 
 [[package]]
 name = "sp1-recursion-circuit"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "hashbrown 0.14.5",
  "itertools 0.13.0",
@@ -5927,7 +5979,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-compiler"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "backtrace",
  "itertools 0.13.0",
@@ -5947,12 +5999,16 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-core"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "backtrace",
+ "cbindgen",
+ "cc",
  "ff 0.13.0",
+ "glob",
  "hashbrown 0.14.5",
  "itertools 0.13.0",
+ "num_cpus",
  "p3-air",
  "p3-baby-bear",
  "p3-bn254-fr",
@@ -5967,13 +6023,14 @@ dependencies = [
  "p3-poseidon2",
  "p3-symmetric",
  "p3-util",
+ "pathdiff",
  "serde",
  "sp1-core-machine",
  "sp1-derive",
  "sp1-primitives",
  "sp1-stark",
  "static_assertions",
- "thiserror",
+ "thiserror 1.0.69",
  "tracing",
  "vec_map",
  "zkhash",
@@ -5981,7 +6038,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-derive"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "quote",
  "syn 1.0.109",
@@ -5989,7 +6046,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-recursion-gnark-ffi"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "anyhow",
  "bincode",
@@ -6013,7 +6070,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-sdk"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "alloy-signer",
  "alloy-signer-local",
@@ -6046,7 +6103,7 @@ dependencies = [
  "strum",
  "strum_macros",
  "tempfile",
- "thiserror",
+ "thiserror 1.0.69",
  "tokio",
  "tracing",
  "twirp-rs",
@@ -6055,11 +6112,12 @@ dependencies = [
 
 [[package]]
 name = "sp1-stark"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "arrayref",
  "hashbrown 0.14.5",
  "itertools 0.13.0",
+ "num-bigint 0.4.6",
  "num-traits",
  "p3-air",
  "p3-baby-bear",
@@ -6087,7 +6145,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-verifier"
-version = "3.0.0"
+version = "4.0.0-rc.2"
 dependencies = [
  "hex",
  "lazy_static",
@@ -6098,7 +6156,7 @@ dependencies = [
 
 [[package]]
 name = "sp1-zkvm"
-version = "3.0.1"
+version = "4.0.0-rc.2"
 dependencies = [
  "cfg-if",
  "getrandom",
@@ -6108,7 +6166,7 @@ dependencies = [
  "p3-field",
  "rand 0.8.5",
  "sha2 0.10.8",
- "sp1-lib 3.0.0",
+ "sp1-lib 4.0.0-rc.2",
  "sp1-primitives",
 ]
 
@@ -6236,11 +6294,11 @@ version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6255,7 +6313,7 @@ dependencies = [
  "lazy_static",
  "rand 0.8.5",
  "rustc-hex",
- "sp1-lib 3.1.0",
+ "sp1-lib 3.4.0",
 ]
 
 [[package]]
@@ -6272,7 +6330,7 @@ dependencies = [
  "num-bigint 0.4.6",
  "rand 0.8.5",
  "rustc-hex",
- "sp1-lib 3.1.0",
+ "sp1-lib 3.4.0",
 ]
 
 [[package]]
@@ -6309,9 +6367,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.87"
+version = "2.0.90"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
+checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -6320,14 +6378,14 @@ dependencies = [
 
 [[package]]
 name = "syn-solidity"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edf42e81491fb8871b74df3d222c64ae8cbc1269ea509fa768a3ed3e1b0ac8cb"
+checksum = "da0523f59468a2696391f2a772edc089342aacd53c3caa2ac3264e598edf119b"
 dependencies = [
  "paste",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6338,9 +6396,9 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
 
 [[package]]
 name = "sync_wrapper"
-version = "1.0.1"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
 dependencies = [
  "futures-core",
 ]
@@ -6365,7 +6423,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6391,9 +6449,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
 [[package]]
 name = "tempfile"
-version = "3.13.0"
+version = "3.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b"
+checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c"
 dependencies = [
  "cfg-if",
  "fastrand",
@@ -6484,22 +6542,42 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.68"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02dd99dc800bbb97186339685293e1cc5d9df1f8fae2d0aecd9ff1c77efea892"
+checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490"
 dependencies = [
- "thiserror-impl",
+ "thiserror-impl 2.0.4",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.68"
+version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7c61ec9a6f64d2793d8a45faba21efbe3ced62a886d44c36a009b2b519b4c7e"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6543,9 +6621,9 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.36"
+version = "0.3.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
+checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21"
 dependencies = [
  "deranged",
  "itoa",
@@ -6566,9 +6644,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
 
 [[package]]
 name = "time-macros"
-version = "0.2.18"
+version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
+checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de"
 dependencies = [
  "num-conv",
  "time-core",
@@ -6610,9 +6688,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.41.0"
+version = "1.42.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb"
+checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551"
 dependencies = [
  "backtrace",
  "bytes",
@@ -6634,25 +6712,24 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "tokio-rustls"
-version = "0.26.0"
+version = "0.26.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
+checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37"
 dependencies = [
  "rustls",
- "rustls-pki-types",
  "tokio",
 ]
 
 [[package]]
 name = "tokio-util"
-version = "0.7.12"
+version = "0.7.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a"
+checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078"
 dependencies = [
  "bytes",
  "futures-core",
@@ -6661,11 +6738,26 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "toml"
+version = "0.8.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit 0.22.22",
+]
+
 [[package]]
 name = "toml_datetime"
 version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
+dependencies = [
+ "serde",
+]
 
 [[package]]
 name = "toml_edit"
@@ -6673,7 +6765,7 @@ version = "0.19.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
  "toml_datetime",
  "winnow 0.5.40",
 ]
@@ -6684,7 +6776,9 @@ version = "0.22.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.0",
+ "serde",
+ "serde_spanned",
  "toml_datetime",
  "winnow 0.6.20",
 ]
@@ -6719,9 +6813,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
 
 [[package]]
 name = "tracing"
-version = "0.1.40"
+version = "0.1.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
 dependencies = [
  "log",
  "pin-project-lite",
@@ -6729,22 +6823,34 @@ dependencies = [
  "tracing-core",
 ]
 
+[[package]]
+name = "tracing-appender"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
+dependencies = [
+ "crossbeam-channel",
+ "thiserror 1.0.69",
+ "time",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "tracing-attributes"
-version = "0.1.27"
+version = "0.1.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "tracing-core"
-version = "0.1.32"
+version = "0.1.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
 dependencies = [
  "once_cell",
  "valuable",
@@ -6758,7 +6864,7 @@ checksum = "ee40835db14ddd1e3ba414292272eddde9dad04d3d4b65509656414d1c42592f"
 dependencies = [
  "ansi_term",
  "smallvec",
- "thiserror",
+ "thiserror 1.0.69",
  "tracing",
  "tracing-subscriber",
 ]
@@ -6776,9 +6882,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-subscriber"
-version = "0.3.18"
+version = "0.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
+checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
 dependencies = [
  "matchers",
  "nu-ansi-term",
@@ -6814,7 +6920,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
- "thiserror",
+ "thiserror 1.0.69",
  "tokio",
  "tower",
  "url",
@@ -6852,9 +6958,9 @@ checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.13"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
 [[package]]
 name = "unicode-segmentation"
@@ -6868,6 +6974,12 @@ version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
 
+[[package]]
+name = "unicode-width"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
+
 [[package]]
 name = "unicode-xid"
 version = "0.2.6"
@@ -6882,9 +6994,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
 
 [[package]]
 name = "url"
-version = "2.5.3"
+version = "2.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada"
+checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
 dependencies = [
  "form_urlencoded",
  "idna",
@@ -6975,9 +7087,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e"
+checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c"
 dependencies = [
  "cfg-if",
  "once_cell",
@@ -6986,36 +7098,37 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358"
+checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd"
 dependencies = [
  "bumpalo",
  "log",
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.45"
+version = "0.4.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b"
+checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d"
 dependencies = [
  "cfg-if",
  "js-sys",
+ "once_cell",
  "wasm-bindgen",
  "web-sys",
 ]
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56"
+checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -7023,22 +7136,22 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68"
+checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.95"
+version = "0.2.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d"
+checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49"
 
 [[package]]
 name = "wasm-streams"
@@ -7055,9 +7168,9 @@ dependencies = [
 
 [[package]]
 name = "web-sys"
-version = "0.3.72"
+version = "0.3.74"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112"
+checksum = "a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -7075,9 +7188,9 @@ dependencies = [
 
 [[package]]
 name = "webpki-roots"
-version = "0.26.6"
+version = "0.26.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958"
+checksum = "5d642ff16b7e79272ae451b7322067cdc17cadf68c23264be9d94a32319efe7e"
 dependencies = [
  "rustls-pki-types",
 ]
@@ -7342,9 +7455,9 @@ dependencies = [
 
 [[package]]
 name = "yoke"
-version = "0.7.4"
+version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
+checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
 dependencies = [
  "serde",
  "stable_deref_trait",
@@ -7354,13 +7467,13 @@ dependencies = [
 
 [[package]]
 name = "yoke-derive"
-version = "0.7.4"
+version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
+checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "synstructure 0.13.1",
 ]
 
@@ -7382,27 +7495,27 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "zerofrom"
-version = "0.1.4"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
+checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
 dependencies = [
  "zerofrom-derive",
 ]
 
 [[package]]
 name = "zerofrom-derive"
-version = "0.1.4"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
+checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
  "synstructure 0.13.1",
 ]
 
@@ -7423,7 +7536,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -7445,7 +7558,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.87",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -7475,11 +7588,6 @@ dependencies = [
  "subtle",
 ]
 
-[[patch.unused]]
-name = "ecdsa"
-version = "0.16.8"
-source = "git+https://github.com/sp1-patches/signatures?branch=umadayal/secp256r1#49b6288468aff7f88f0be8cfd3719c7c20b2ba47"
-
 [[patch.unused]]
 name = "sha2"
 version = "0.10.6"
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index d10ee9741e..143fe26f0c 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -66,11 +66,11 @@ serde_json = "1.0.132"
 tracing = "0.1.40"
 
 [patch.crates-io]
-curve25519-dalek = { git = "https://github.com/sp1-patches/curve25519-dalek", tag = "curve25519_dalek-v4.1.3-patch-v1" }
+curve25519-dalek = { git = "https://github.com/sp1-patches/curve25519-dalek", tag = "patch-v4.1.3-v3.4.0" }
 curve25519-dalek-ng = { git = "https://github.com/sp1-patches/curve25519-dalek-ng", tag = "curve25519_dalek_ng-v4.1.1-patch-v1" } 
-ecdsa-core = { git = "https://github.com/sp1-patches/signatures", package = "ecdsa", branch = "umadayal/secp256r1" }
+ecdsa-core = { git = "https://github.com/sp1-patches/signatures", package = "ecdsa", branch = "ecdsa-v0.16.9-patch-v4.0.0-rc.2" }
 ed25519-consensus = { git = "https://github.com/sp1-patches/ed25519-consensus", tag = "ed25519_consensus-v2.1.0-patch-v1" }
-secp256k1 = { git = "https://github.com/sp1-patches/rust-secp256k1", tag = "secp256k1-v0.29.0-patch-v1" }
+secp256k1 = { git = "https://github.com/sp1-patches/rust-secp256k1", tag = "patch-v0.29.1-v4.0.0" }
 sha2-v0-10-8 = { git = "https://github.com/sp1-patches/RustCrypto-hashes", package = "sha2", tag = "sha2-v0.10.8-patch-v1" }
 sha2-v0-10-6 = { git = "https://github.com/sp1-patches/RustCrypto-hashes", package = "sha2", tag = "sha2-v0.10.6-patch-v1" }
 sha2-v0-9-9 = { git = "https://github.com/sp1-patches/RustCrypto-hashes", package = "sha2", tag = "sha2-v0.9.9-patch-v1" }
diff --git a/examples/elf/riscv32im-succinct-zkvm-elf b/examples/elf/riscv32im-succinct-zkvm-elf
new file mode 100755
index 0000000000..d16b4f1942
Binary files /dev/null and b/examples/elf/riscv32im-succinct-zkvm-elf differ
diff --git a/examples/fibonacci/script/Cargo.toml b/examples/fibonacci/script/Cargo.toml
index 308d3eeede..e8b9e5d787 100644
--- a/examples/fibonacci/script/Cargo.toml
+++ b/examples/fibonacci/script/Cargo.toml
@@ -7,7 +7,7 @@ publish = false
 
 [dependencies]
 hex = "0.4.3"
-sp1-sdk = { workspace = true }
+sp1-sdk = { workspace = true, features = ["native-gnark"] }
 
 [build-dependencies]
 sp1-build = { workspace = true }
diff --git a/examples/fibonacci/script/src/main.rs b/examples/fibonacci/script/src/main.rs
index 4cfb02879c..9f400183fa 100644
--- a/examples/fibonacci/script/src/main.rs
+++ b/examples/fibonacci/script/src/main.rs
@@ -24,7 +24,7 @@ fn main() {
 
     // Generate the proof for the given program and input.
     let (pk, vk) = client.setup(ELF);
-    let mut proof = client.prove(&pk, stdin).run().unwrap();
+    let mut proof = client.prove(&pk, stdin).groth16().run().unwrap();
 
     println!("generated proof");
 
diff --git a/examples/patch-testing/program/Cargo.toml b/examples/patch-testing/program/Cargo.toml
index a9d4feff9e..f8b299e176 100644
--- a/examples/patch-testing/program/Cargo.toml
+++ b/examples/patch-testing/program/Cargo.toml
@@ -19,6 +19,7 @@ curve25519-dalek = { version = "4.1.3", default-features = false, features = ["a
 curve25519-dalek-ng = { version = "4.1", default-features = false, features = ["u32_backend", "alloc"] }
 k256 = { version = "0.13.3", default-features = false, features = ["ecdsa"] }
 p256 = { version = "0.13.2", default-features = false, features = ["ecdsa"] }
+ecdsa-core = { version = "0.16.9", package = "ecdsa" }
 alloy-primitives = { version = "0.8", features = ["k256"] }
 secp256k1 = { version = "0.29", features = ["recovery", "global-context"] }
 
diff --git a/examples/patch-testing/program/src/main.rs b/examples/patch-testing/program/src/main.rs
index fe6e0d36a0..3418ae7e3b 100644
--- a/examples/patch-testing/program/src/main.rs
+++ b/examples/patch-testing/program/src/main.rs
@@ -6,12 +6,17 @@ use alloy_primitives::{address, bytes, hex};
 use alloy_primitives::{B256, B512};
 use curve25519_dalek::edwards::CompressedEdwardsY as CompressedEdwardsY_dalek;
 use curve25519_dalek_ng::edwards::CompressedEdwardsY as CompressedEdwardsY_dalek_ng;
+use ecdsa_core::RecoveryId as ecdsaRecoveryId;
 use ed25519_consensus::{
     Signature as Ed25519ConsensusSignature, VerificationKey as Ed25519ConsensusVerificationKey,
 };
 use ed25519_dalek::{
     Signature as Ed25519DalekSignature, Verifier, VerifyingKey as Ed25519DalekVerifyingKey,
 };
+use p256::{
+    ecdsa::{Signature as P256Signature, SigningKey, VerifyingKey as P256VerifyingKey},
+    elliptic_curve::rand_core::OsRng,
+};
 
 use sha2_v0_10_6::{Digest as Digest_10_6, Sha256 as Sha256_10_6};
 // use sha2_v0_10_8::{Digest as Digest_10_8, Sha256 as Sha256_10_8};
@@ -81,15 +86,31 @@ fn test_curve25519_dalek_ng() {
 
 /// Emits ED_DECOMPRESS syscall.
 fn test_curve25519_dalek() {
-    let input = [1u8; 32];
-    let y = CompressedEdwardsY_dalek(input);
+    let input_passing = [1u8; 32];
+
+    // This y-coordinate is not square, and therefore not on the curve
+    let limbs: [u64; 4] =
+        [8083970408152925034, 11907700107021980321, 16259949789167878387, 5645861033211660086];
+
+    // Convert each limb to big-endian bytes and concatenate them into a 32-byte array.
+    let input_failing: [u8; 32] =
+        limbs.iter().flat_map(|l| l.to_be_bytes()).collect::<Vec<u8>>().try_into().unwrap();
+
+    let y_passing = CompressedEdwardsY_dalek(input_passing);
 
     println!("cycle-tracker-start: curve25519-dalek decompress");
-    let decompressed_key = y.decompress().unwrap();
+    let decompressed_key = y_passing.decompress().unwrap();
     println!("cycle-tracker-end: curve25519-dalek decompress");
 
     let compressed_key = decompressed_key.compress();
-    assert_eq!(compressed_key, y);
+    assert_eq!(compressed_key, y_passing);
+
+    let y_failing = CompressedEdwardsY_dalek(input_failing);
+    println!("cycle-tracker-start: curve25519-dalek decompress");
+    let decompressed_key = y_failing.decompress();
+    println!("cycle-tracker-end: curve25519-dalek decompress");
+
+    assert!(decompressed_key.is_none());
 }
 
 /// Emits KECCAK_PERMUTE syscalls.
@@ -123,29 +144,34 @@ fn test_sha256() {
 }
 
 fn test_p256_patch() {
-    // A valid signature.
-    let precompile_input = bytes!("b5a77e7a90aa14e0bf5f337f06f597148676424fae26e175c6e5621c34351955289f319789da424845c9eac935245fcddd805950e2f02506d09be7e411199556d262144475b1fa46ad85250728c600c53dfd10f8b3f4adf140e27241aec3c2da3a81046703fccf468b48b145f939efdbb96c3786db712b3113bb2488ef286cdcef8afe82d200a5bb36b5462166e8ce77f2d831a52ef2135b2af188110beaefb1");
-    println!("cycle-tracker-start: p256 verify");
-    let result = revm_precompile::secp256r1::verify_impl(&precompile_input);
-    println!("cycle-tracker-end: p256 verify");
-
-    assert!(result.is_some());
-
-    let invalid_test_cases = vec![
-            bytes!("3cee90eb86eaa050036147a12d49004b6b9c72bd725d39d4785011fe190f0b4da73bd4903f0ce3b639bbbf6e8e80d16931ff4bcf5993d58468e8fb19086e8cac36dbcd03009df8c59286b162af3bd7fcc0450c9aa81be5d10d312af6c66b1d604aebd3099c618202fcfe16ae7770b0c49ab5eadf74b754204a3bb6060e44eff37618b065f9832de4ca6ca971a7a1adc826d0f7c00181a5fb2ddf79ae00b4e10e"),
-            bytes!("afec5769b5cf4e310a7d150508e82fb8e3eda1c2c94c61492d3bd8aea99e06c9e22466e928fdccef0de49e3503d2657d00494a00e764fd437bdafa05f5922b1fbbb77c6817ccf50748419477e843d5bac67e6a70e97dde5a57e0c983b777e1ad31a80482dadf89de6302b1988c82c29544c9c07bb910596158f6062517eb089a2f54c9a0f348752950094d3228d3b940258c75fe2a413cb70baa21dc2e352fc5"),
-            bytes!("f775723953ead4a90411a02908fd1a629db584bc600664c609061f221ef6bf7c440066c8626b49daaa7bf2bcc0b74be4f7a1e3dcf0e869f1542fe821498cbf2de73ad398194129f635de4424a07ca715838aefe8fe69d1a391cfa70470795a80dd056866e6e1125aff94413921880c437c9e2570a28ced7267c8beef7e9b2d8d1547d76dfcf4bee592f5fefe10ddfb6aeb0991c5b9dbbee6ec80d11b17c0eb1a"),
-            bytes!("4cee90eb86eaa050036147a12d49004b6a"),
-            bytes!("4cee90eb86eaa050036147a12d49004b6b9c72bd725d39d4785011fe190f0b4da73bd4903f0ce3b639bbbf6e8e80d16931ff4bcf5993d58468e8fb19086e8cac36dbcd03009df8c59286b162af3bd7fcc0450c9aa81be5d10d312af6c66b1d604aebd3099c618202fcfe16ae7770b0c49ab5eadf74b754204a3bb6060e44eff37618b065f9832de4ca6ca971a7a1adc826d0f7c00181a5fb2ddf79ae00b4e10e00"),
-            bytes!("b5a77e7a90aa14e0bf5f337f06f597148676424fae26e175c6e5621c34351955289f319789da424845c9eac935245fcddd805950e2f02506d09be7e411199556d262144475b1fa46ad85250728c600c53dfd10f8b3f4adf140e27241aec3c2daaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaef8afe82d200a5bb36b5462166e8ce77f2d831a52ef2135b2af188110beaefb1")
-    ];
-
-    for input in invalid_test_cases {
-        println!("cycle-tracker-start: p256 verify false");
-        let result = revm_precompile::secp256r1::verify_impl(&input);
-        println!("cycle-tracker-end: p256 verify false");
-        assert!(result.is_none());
-    }
+    let message = hex!("656432353531392d636f6e73656e7375732074657374206d657373616765");
+    let mut hasher = Sha256_10_6::new();
+    hasher.update(message);
+    let message_prehash = hasher.finalize();
+    println!("message_prehash: {:?}", message_prehash);
+
+    let signing_key = SigningKey::random(&mut OsRng);
+    let (mut signature, recid) = signing_key.sign_prehash_recoverable(&message_prehash).unwrap();
+    println!("signature: {:?}", signature);
+    println!("recid: {:?}", recid);
+
+    let mut recid_byte = recid.to_byte();
+
+    assert!(signature.normalize_s().is_some());
+
+    //if let Some(sig_normalized) = signature.normalize_s() {
+    //    signature = sig_normalized;
+    //    recid_byte ^= 1;
+    //}
+    //
+    //let recid = ecdsaRecoveryId::from_byte(recid_byte).unwrap();
+
+    println!("cycle-tracker-start: p256 recovery");
+    let recovered_key =
+        P256VerifyingKey::recover_from_prehash(&message_prehash, &signature, recid).unwrap();
+
+    assert_eq!(&recovered_key, signing_key.verifying_key());
+    println!("cycle-tracker-end: p256 recovery");
 }
 
 /// Emits SECP256K1_ADD, SECP256K1_DOUBLE, and SECP256K1_DECOMPRESS syscalls.
@@ -208,6 +234,7 @@ fn test_secp256k1_patch() {
     let serialized_key = public_key.serialize_uncompressed();
 
     let sig = Secp256k1Signature::from_compact(&hex!("80AEBD912F05D302BA8000A3C5D6E604333AAF34E22CC1BA14BE1737213EAED5040D67D6E9FA5FBDFE6E3457893839631B87A41D90508B7C92991ED7824E962D")).unwrap();
+    println!("secp256k1 verify_ecdsa");
     println!("cycle-tracker-start: secp256k1 verify_ecdsa");
     let result = secp.verify_ecdsa(&message, &sig, &public_key);
     println!("cycle-tracker-end: secp256k1 verify_ecdsa");
@@ -220,8 +247,8 @@ fn test_secp256k1_patch() {
 
 /// To add testing for a new patch, add a new case to the function below.
 pub fn main() {
-    // TODO: Specify which syscalls are linked to each function invocation, iterate
-    // over this list that is shared between the program and script.
+    // TODO: Specify which syscalls are linked to each function invocation, iterate
+    // over this list that is shared between the program and script.
     test_keccak();
     test_sha256();
 
@@ -232,6 +259,7 @@ pub fn main() {
     test_ed25519_consensus();
 
     test_k256_patch();
-    test_secp256k1_patch();
     test_p256_patch();
+
+    test_secp256k1_patch();
 }
diff --git a/rustfmt.toml b/rustfmt.toml
index 21efa865c0..68c3c93033 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -1,11 +1,11 @@
 reorder_imports = true
-# imports_granularity = "Crate"
+imports_granularity = "Crate"
 use_small_heuristics = "Max"
-# comment_width = 100
-# wrap_comments = true
-# binop_separator = "Back"
-# trailing_comma = "Vertical"
-# trailing_semicolon = false
+comment_width = 100
+wrap_comments = true
+binop_separator = "Back"
+trailing_comma = "Vertical"
+trailing_semicolon = false
 use_field_init_shorthand = true
-# format_code_in_doc_comments = true
-# doc_comment_code_block_width = 100
+format_code_in_doc_comments = true
+doc_comment_code_block_width = 100