-
Notifications
You must be signed in to change notification settings - Fork 0
/
frontera.horovod.slurm
51 lines (40 loc) · 1.67 KB
/
frontera.horovod.slurm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash -l
#SBATCH -J SHMEM_ML # Job name
#SBATCH -p development # Queue (partition) name
#SBATCH -N 32 # Total # of nodes (must be 1 for serial)
#SBATCH --ntasks-per-node=56
#SBATCH -t 01:00:00 # Run time (hh:mm:ss)
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH [email protected]
#SBATCH --exclusive
#####SBATCH --contiguous
# ---------------------------------------------------------------------------
# Job body: configures the OpenSHMEM / SHMEM-ML environment, creates a per-job
# working directory under $SCRATCH, and launches the Keras + Horovod example
# through ibrun.
#
# Required environment:
#   SCRATCH        root under which the per-job directory is created
#   SLURM_JOB_ID   used to name the per-job working directory
#   SLURM_NNODES   used to compute the total task count in SLURM_ARGS
# Optional environment:
#   SHMEM_HOME     only echoed for the log (presumably set by ~/.profile;
#                  verify)
# ---------------------------------------------------------------------------
set -e
ulimit -c unlimited # no size cap on core files
# shellcheck disable=SC1090 # user profile, not available to the linter
source ~/.profile

# Fail fast with a readable message instead of creating "/job.<id>" at the
# filesystem root or hitting an arithmetic syntax error further down.
: "${SCRATCH:?SCRATCH must be set to the scratch directory root}"
: "${SLURM_JOB_ID:?SLURM_JOB_ID is not set; submit this script with sbatch}"
: "${SLURM_NNODES:?SLURM_NNODES is not set; submit this script with sbatch}"

echo "Running on:"
# Quoted: node lists use bracket syntax (e.g. "c101-[001-032]"), which an
# unquoted expansion would hand to the shell as a glob pattern.
printf '%s\n' "$SLURM_NODELIST"
echo
echo "Running with OpenSHMEM installation at $SHMEM_HOME"

# 2 sockets x 28 cores per socket for Frontera
export CORES_PER_SOCKET=28
export SOCKETS_PER_NODE=2
export CORES_PER_NODE=$((SOCKETS_PER_NODE * CORES_PER_SOCKET))

# Alternative (larger) sizes kept for reference:
# export SHMEM_SYMMETRIC_SIZE=$((2 * 1024 * 1024 * 1024 + 512 * 1024 * 1024))
# export SHMEM_ML_POOL_SIZE=$((1 * 1024 * 1024 * 1024 + 512 * 1024 * 1024))
export SHMEM_SYMMETRIC_SIZE=$((1024 * 1024 * 1024))
export SHMEM_ML_POOL_SIZE=$((512 * 1024 * 1024))
export SHMEM_ML_MAX_MAILBOX_BUFFERS=1

# Run from a fresh per-job directory so output files of different jobs do not
# collide.
job_dir="$SCRATCH/job.$SLURM_JOB_ID"
mkdir -p -- "$job_dir"
cd -- "$job_dir"

#export OSHRUN_DEBUG=y
# NOTE(review): nothing in this script reads SLURM_ARGS; it is still exported
# in case the launcher or the Python program consumes it -- confirm or remove.
export SLURM_ARGS="--ntasks=$((SLURM_NNODES * CORES_PER_NODE)) --ntasks-per-socket=$CORES_PER_SOCKET --cpus-per-task=1"

echo

# 0 = all messages are logged (default behavior)
# 1 = INFO messages are not printed
# 2 = INFO and WARNING messages are not printed
# 3 = INFO, WARNING, and ERROR messages are not printed
export TF_CPP_MIN_LOG_LEVEL=3

# Alternative launchers kept for reference:
# oshrun -N 56 python $HOME/shmem_ml/example/keras_horovod.py
# horovodrun -np $(($SLURM_NNODES * $CORES_PER_NODE)) python $HOME/shmem_ml/example/keras_horovod.py

# Derived from the socket/core layout above (evaluates to 56) rather than
# repeating the literal; keep it equal to --ntasks-per-node in the #SBATCH
# header.
export IBRUN_TASKS_PER_NODE=$CORES_PER_NODE
ibrun python "$HOME/shmem_ml/example/keras_horovod.py"