Skip to content

Commit

Permalink
feat: autoscale shard / batch size (#1435)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuwen01 authored Sep 4, 2024
1 parent 43461b1 commit fbdaa88
Show file tree
Hide file tree
Showing 11 changed files with 281 additions and 1,448 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -269,3 +269,47 @@ jobs:
--branch-name "${{ github.head_ref || github.ref_name }}" \
--commit-hash "${{ github.sha }}" \
--author "${{ github.event.pull_request.user.login || github.actor }}"
low-memory:
name: Low Memory
strategy:
matrix:
mem_limit: [16, 32, 64]
runs-on:
[
runs-on,
"ram=${{ matrix.mem_limit }}",
family=c7a,
image=ubuntu22-full-x64,
"run-id=${{ github.run_id }}",
]
env:
CARGO_NET_GIT_FETCH_WITH_CLI: "true"
steps:
- name: Checkout sources
uses: actions/checkout@v4

- name: Setup CI
uses: ./.github/actions/setup

- name: Install SP1 toolchain
run: |
curl -L https://sp1.succinct.xyz | bash
~/.sp1/bin/sp1up
~/.sp1/bin/cargo-prove prove --version
- name: Install SP1 CLI
run: |
cd crates/cli
cargo install --force --locked --path .
cd ~
- name: Run tendermint script
run: |
cd examples/tendermint/program
cargo add sp1-zkvm --path $GITHUB_WORKSPACE/crates/zkvm/entrypoint
cargo prove build
cd ../script
cargo remove sp1-sdk
cargo add sp1-sdk --path $GITHUB_WORKSPACE/crates/sdk
SP1_DEV=1 RUST_LOG=info cargo run --release
37 changes: 35 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/stark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ tracing = "0.1.40"
rayon-scan = "0.1.1"
arrayref = "0.3.8"
getrandom = { version = "0.2.15", features = ["custom"] }
sysinfo = "0.15.1"


[dev-dependencies]
Expand Down
53 changes: 46 additions & 7 deletions crates/stark/src/opts.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use std::env;

use serde::{Deserialize, Serialize};
use sysinfo::{System, SystemExt};

// NOTE(review): the two DEFAULT_* constants below appear to be pre-autoscaling
// residue superseded by the MAX_* pair introduced in this change — confirm and
// remove once no callers reference them.
const DEFAULT_SHARD_SIZE: usize = 1 << 22;
const DEFAULT_SHARD_BATCH_SIZE: usize = 16;
// Hard ceilings for the autoscaled values: shard size never exceeds 2^22 rows,
// and batch size never exceeds 8 (empirically, larger batches gave no speedup).
const MAX_SHARD_SIZE: usize = 1 << 22;
const MAX_SHARD_BATCH_SIZE: usize = 8;
// Default number of worker threads used for trace generation.
const DEFAULT_TRACE_GEN_WORKERS: usize = 1;
// Default bounded-channel capacities for the checkpoint / record-and-trace pipelines.
const DEFAULT_CHECKPOINTS_CHANNEL_CAPACITY: usize = 128;
const DEFAULT_RECORDS_AND_TRACES_CHANNEL_CAPACITY: usize = 1;
Expand Down Expand Up @@ -42,19 +43,55 @@ pub struct SP1CoreOpts {
pub records_and_traces_channel_capacity: usize,
}

/// Calculate the default shard size using an empirically determined formula.
///
/// Machines with very little memory (≤ 14 GB) are pinned to a shard size of 2^18.
/// All other machines use a formula that is linear in the log of total memory,
/// derived by benchmarking the peak physical memory usage of
/// [rsp](https://github.com/succinctlabs/rsp) across a range of shard sizes and
/// shard batch sizes and fitting a linear regression to the results. The result
/// is capped at `MAX_SHARD_SIZE`.
#[allow(clippy::cast_precision_loss)]
fn shard_size(total_available_mem: u64) -> usize {
    // Pick the exponent first, then materialize the power of two.
    let log2_shard_size = if total_available_mem <= 14 {
        // Super memory-constrained machines get the floor value.
        18
    } else {
        let mem = total_available_mem as f64;
        (mem.log2() * 0.619 + 17.2).floor() as usize
    };
    std::cmp::min(1 << log2_shard_size, MAX_SHARD_SIZE)
}

/// Calculate the default shard batch size using an empirically determined formula.
///
/// Memory-constrained machines need a batch size of 1 or 2. Machines with a very
/// large amount of memory (≥ 256 GB) can use `MAX_SHARD_BATCH_SIZE`; empirically,
/// going above 8 does not yield a significant speedup. Most machines fall in
/// between and simply use batch size 4.
fn shard_batch_size(total_available_mem: u64) -> usize {
    // Check the extremes first, then fall through to the common case.
    if total_available_mem >= 256 {
        MAX_SHARD_BATCH_SIZE
    } else if total_available_mem <= 16 {
        1
    } else if total_available_mem <= 48 {
        2
    } else {
        4
    }
}

impl Default for SP1CoreOpts {
fn default() -> Self {
let split_threshold = env::var("SPLIT_THRESHOLD")
.map(|s| s.parse::<usize>().unwrap_or(DEFERRED_SPLIT_THRESHOLD))
.unwrap_or(DEFERRED_SPLIT_THRESHOLD);
let sys = System::new_all();
let total_available_mem = sys.get_total_memory() / (1024 * 1024);
let default_shard_size = shard_size(total_available_mem);
let default_shard_batch_size = shard_batch_size(total_available_mem);

Self {
shard_size: env::var("SHARD_SIZE").map_or_else(
|_| DEFAULT_SHARD_SIZE,
|s| s.parse::<usize>().unwrap_or(DEFAULT_SHARD_SIZE),
|_| default_shard_size,
|s| s.parse::<usize>().unwrap_or(default_shard_size),
),
shard_batch_size: env::var("SHARD_BATCH_SIZE").map_or_else(
|_| DEFAULT_SHARD_BATCH_SIZE,
|s| s.parse::<usize>().unwrap_or(DEFAULT_SHARD_BATCH_SIZE),
|_| default_shard_batch_size,
|s| s.parse::<usize>().unwrap_or(default_shard_batch_size),
),
split_opts: SplitOpts::new(split_threshold),
reconstruct_commitments: true,
Expand All @@ -81,7 +118,9 @@ impl SP1CoreOpts {
/// Options tuned for recursion proving.
///
/// Starts from the autoscaled defaults, disables commitment reconstruction,
/// and pins the shard size to the only value recursion supports.
pub fn recursion() -> Self {
    let mut opts = Self::default();
    opts.reconstruct_commitments = false;

    // Recursion only supports 1 << 22 shard size, so override the
    // memory-autoscaled default unconditionally. (The stale assignment of the
    // removed DEFAULT_SHARD_SIZE constant is dropped — it no longer compiles
    // and was immediately overwritten anyway.)
    opts.shard_size = MAX_SHARD_SIZE;
    opts
}
}
Expand Down
Loading

0 comments on commit fbdaa88

Please sign in to comment.