support different network interface tests

exo-explore · Dec 17, 2024 · 023ddc2
1 parent 2f0b543
commit 023ddc2
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 14 deletions.
diff --git a/.github/workflows/bench_job.yml b/.github/workflows/bench_job.yml
@@ -13,6 +13,9 @@ on:
       calling_job_name:
         required: true
         type: string
+      network_interface:
+        required: true
+        type: string
 jobs:
   generate-matrix:
     runs-on: ubuntu-latest
@@ -122,7 +125,7 @@ jobs:
           sudo taskpolicy -d default -g default -a -t 0 -l 0 .venv/bin/exo \
             --node-id="${MY_NODE_ID}" \
             --node-id-filter="${ALL_NODE_IDS}" \
-            --interface-type-filter="Ethernet" \
+            --interface-type-filter="${{ inputs.network_interface }}" \
             --disable-tui \
             --max-generate-tokens 250 \
             --chatgpt-api-port 52415 > output1.log 2>&1 &

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -17,6 +17,7 @@ jobs:
       config: '{"M4PRO_GPU16_24GB": 1}'
       model: ${{ matrix.model }}
       calling_job_name: 'single-m4-pro'
+      network_interface: 'Ethernet'
     secrets: inherit
 
   two-m4-pro-cluster:
@@ -28,30 +29,43 @@ jobs:
       config: '{"M4PRO_GPU16_24GB": 2}'
       model: ${{ matrix.model }}
       calling_job_name: 'two-m4-pro-cluster'
+      network_interface: 'Ethernet'
     secrets: inherit
 
+  # two-m4-pro-cluster-thunderbolt:
+  #   strategy:
+  #     matrix:
+  #       model: ['llama-3.2-1b', 'llama-3.2-3b', 'llama-3.1-8b']
+  #   uses: ./.github/workflows/bench_job.yml
+  #   with:
+  #     config: '{"M4PRO_GPU16_24GB": 2}'
+  #     model: ${{ matrix.model }}
+  #     calling_job_name: 'two-m4-pro-cluster-thunderbolt'
+  #     network_interface: 'Thunderbolt'
+  #   secrets: inherit
+
   three-m4-pro-cluster:
     strategy:
       matrix:
         model: ['llama-3.2-1b', 'llama-3.2-3b', 'llama-3.1-8b', 'llama-3.3-70b']
-      # Optional: add fail-fast: false if you want all matrix jobs to continue even if one fails
       fail-fast: false
     uses: ./.github/workflows/bench_job.yml
     with:
       config: '{"M4PRO_GPU16_24GB": 3}'
       model: ${{ matrix.model }}
       calling_job_name: 'three-m4-pro-cluster'
+      network_interface: 'Ethernet'
     secrets: inherit
 
-  # test-m3-single-node:
-  #   strategy:
-  #     matrix:
-  #       model: ['llama-3.2-1b']
-  #     # Optional: add fail-fast: false if you want all matrix jobs to continue even if one fails
-  #     fail-fast: false
-  #   uses: ./.github/workflows/bench_job.yml
-  #   with:
-  #     config: '{"M3MAX_GPU40_128GB": 1}'
-  #     model: ${{ matrix.model }}
-  #     calling_job_name: 'test-m3-cluster'
-  #   secrets: inherit
+  test-m3-single-node:
+    strategy:
+      matrix:
+        model: ['llama-3.2-1b']
+      fail-fast: false
+    uses: ./.github/workflows/bench_job.yml
+    with:
+      config: '{"M3MAX_GPU40_128GB": 1}'
+      model: ${{ matrix.model }}
+      calling_job_name: 'test-m3-cluster'
+      network_interface: 'Ethernet'
+    secrets: inherit