fast-llm.sbat
#!/bin/bash
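# Slurm batch script: distributed Fast-LLM training on 4 nodes x 8 GPUs,
# launched with torchrun (one launcher task per node).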
#SBATCH --job-name=fast_llm_train
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --output=job_output.log
#SBATCH --error=job_error.log
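# Assumed usage: submit from the Fast-LLM repository root so the relative
# config path below resolves, e.g.:
#   sbatch fast-llm.sbat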
# Use the first node in the allocation as the torchrun rendezvous host.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=8001
echo "MASTER_ADDR=$MASTER_ADDR"
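# NCCL / UCX network settings. NCCL_SOCKET_IFNAME and UCX_TLS are
# site-specific: eno1 is this cluster's host interface, so adjust these
# values for your own fabric.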
export NCCL_DEBUG=WARN
export NCCL_SOCKET_IFNAME=eno1
export UCX_TLS=self,shm,tcp
export NCCL_NET_GDR_LEVEL=PIX
export NCCL_IB_PCI_RELAXED_ORDERING=1
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export PYTHONHASHSEED=0
export TRITON_ALLOW_NON_CONSTEXPR_GLOBALS=1
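# Launch one torchrun per node. Variables escaped as \$VAR are expanded by
# the inner shell on each node, so every node picks up its own rank.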
srun --gpus-per-node=$SLURM_GPUS_PER_NODE \
     --ntasks-per-node=$SLURM_NTASKS_PER_NODE \
     bash -c "
       torchrun --rdzv_backend=static \
                --rdzv_id=0 \
                --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
                --node_rank=\$SLURM_NODEID \
                --nproc_per_node=\$SLURM_GPUS_PER_NODE \
                --nnodes=\$SLURM_NNODES \
                --max_restarts=0 \
                --rdzv_conf=timeout=3600 \
                --no_python \
                fast-llm train gpt \
                --config examples/mistral_4_node_benchmark.yaml"