diff --git a/.github/workflows/bench_job.yml b/.github/workflows/bench_job.yml
index 6143f2395..842f2527e 100644
--- a/.github/workflows/bench_job.yml
+++ b/.github/workflows/bench_job.yml
@@ -13,6 +13,9 @@ on:
       calling_job_name:
         required: true
         type: string
+      network_interface:
+        required: true
+        type: string
 jobs:
   generate-matrix:
     runs-on: ubuntu-latest
@@ -122,7 +125,7 @@ jobs:
           sudo taskpolicy -d default -g default -a -t 0 -l 0 .venv/bin/exo \
             --node-id="${MY_NODE_ID}" \
             --node-id-filter="${ALL_NODE_IDS}" \
-            --interface-type-filter="Ethernet" \
+            --interface-type-filter="${{ inputs.network_interface }}" \
             --disable-tui \
             --max-generate-tokens 250 \
             --chatgpt-api-port 52415 > output1.log 2>&1 &
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 455a950da..ddd4ff8a7 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -17,6 +17,7 @@ jobs:
       config: '{"M4PRO_GPU16_24GB": 1}'
       model: ${{ matrix.model }}
       calling_job_name: 'single-m4-pro'
+      network_interface: 'Ethernet'
     secrets: inherit
 
   two-m4-pro-cluster:
@@ -28,30 +29,43 @@
       config: '{"M4PRO_GPU16_24GB": 2}'
       model: ${{ matrix.model }}
       calling_job_name: 'two-m4-pro-cluster'
+      network_interface: 'Ethernet'
     secrets: inherit
 
+  # two-m4-pro-cluster-thunderbolt:
+  #   strategy:
+  #     matrix:
+  #       model: ['llama-3.2-1b', 'llama-3.2-3b', 'llama-3.1-8b']
+  #   uses: ./.github/workflows/bench_job.yml
+  #   with:
+  #     config: '{"M4PRO_GPU16_24GB": 2}'
+  #     model: ${{ matrix.model }}
+  #     calling_job_name: 'two-m4-pro-cluster-thunderbolt'
+  #     network_interface: 'Thunderbolt'
+  #   secrets: inherit
+
   three-m4-pro-cluster:
     strategy:
       matrix:
         model: ['llama-3.2-1b', 'llama-3.2-3b', 'llama-3.1-8b', 'llama-3.3-70b']
-      # Optional: add fail-fast: false if you want all matrix jobs to continue even if one fails
       fail-fast: false
     uses: ./.github/workflows/bench_job.yml
     with:
       config: '{"M4PRO_GPU16_24GB": 3}'
       model: ${{ matrix.model }}
       calling_job_name: 'three-m4-pro-cluster'
+      network_interface: 'Ethernet'
     secrets: inherit
 
-  # test-m3-single-node:
-  #   strategy:
-  #     matrix:
-  #       model: ['llama-3.2-1b']
-  #       # Optional: add fail-fast: false if you want all matrix jobs to continue even if one fails
-  #       fail-fast: false
-  #   uses: ./.github/workflows/bench_job.yml
-  #   with:
-  #     config: '{"M3MAX_GPU40_128GB": 1}'
-  #     model: ${{ matrix.model }}
-  #     calling_job_name: 'test-m3-cluster'
-  #     secrets: inherit
\ No newline at end of file
+  test-m3-single-node:
+    strategy:
+      matrix:
+        model: ['llama-3.2-1b']
+      fail-fast: false
+    uses: ./.github/workflows/bench_job.yml
+    with:
+      config: '{"M3MAX_GPU40_128GB": 1}'
+      model: ${{ matrix.model }}
+      calling_job_name: 'test-m3-cluster'
+      network_interface: 'Ethereum'
+      network_interface: 'Ethernet'
+    secrets: inherit