# .env (forked from IBM/Grapher)
####################################################
# Primary parameters
####################################################
# Set the storage drive for the model outputs (taken here from the $DATA environment variable)
export STORAGE_DRIVE="$DATA"
# Set the dataset name
# Currently, only the webnlg dataset is supported
export DATASET=webnlg
# Set the model variant you want to train, test, or use for generation
# 0 ... edge generation head; 1 ... edge classification head
export MODEL_VARIANT=0
# -2 ... training starts from scratch
# -1 ... training resumes from (or the test uses) the last checkpoint of the last run of MODEL_VARIANT
# >= 0 ... training resumes from (or the test uses) the checkpoint of a specific epoch from the last run of MODEL_VARIANT
export CHECKPOINT_MODEL_ID=-1
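# Illustrative values (the epoch number below is hypothetical):
#   export CHECKPOINT_MODEL_ID=-2   # start a fresh run in a new timestamped directory
#   export CHECKPOINT_MODEL_ID=40   # resume from the epoch-40 checkpoint of the last run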
# Select the hardware setting for training, test, or generation
# Available hardware settings (defined below):
# No SLURM; one node; two GPUs per node; Nvidia A40:  1_2_a40
# No SLURM; one node; one GPU per node;  Nvidia A40:  1_1_a40
# No SLURM; one node; two GPUs per node; Nvidia A100: 1_2_a100
# No SLURM; one node; two GPUs per node; Nvidia TI:   1_2_ti
# SLURM; two nodes; two GPUs per node; Nvidia A40: s2_2_a40
# SLURM; one node;  two GPUs per node; Nvidia A40: s1_2_a40
# SLURM; one node;  one GPU per node;  Nvidia A40: s1_1_a40
export HARDWARE_SETTING=s1_1_a40
# Email address to which SLURM sends a notification when the job starts
export [email protected]
####################################################
# Directories
####################################################
# Timestamp used for new output directories
CURRENT_DATETIME=$(date +"%Y-%m-%dT%H:%M:%S")
# The directories below should only be changed with care
export ROOT_DIR="${STORAGE_DRIVE}/data/core/grapher/output"
if [[ $MODEL_VARIANT -eq 0 ]]; then
export EVAL_DIR="${ROOT_DIR}/${DATASET}_model_variant=gen"
elif [[ $MODEL_VARIANT -eq 1 ]]; then
export EVAL_DIR="${ROOT_DIR}/${DATASET}_model_variant=class"
else
echo "Non-existing model variant. Choose either 0 for edge generation head or 1 for edge classification head"
exit 1
fi
if [ ! -d "$EVAL_DIR" ]; then
mkdir -p "$EVAL_DIR"
fi
valid_dirs=()
for directory in "$EVAL_DIR"/*; do
timestamp=$(basename "$directory")
if date -d "$timestamp" "+%Y-%m-%dT%H:%M:%S" >/dev/null 2>&1; then
valid_dirs+=("$timestamp")
fi
done
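# Note: the glob above returns entries in sorted order, so ISO-8601 timestamps come out
# chronologically and ${valid_dirs[-1]} below is the most recent run (negative array
# indices require bash >= 4.3).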
if [ $CHECKPOINT_MODEL_ID -eq -2 ] || [ ${#valid_dirs[@]} -eq 0 ]; then
export EXEC_DIR="${EVAL_DIR}/${CURRENT_DATETIME}"
mkdir -p "${EXEC_DIR}/checkpoints" "${EXEC_DIR}/valid" "${EXEC_DIR}/test"
echo "Created new directory: ${EXEC_DIR} with three sub directories"
elif [ $CHECKPOINT_MODEL_ID -gt -2 ] && [ ${#valid_dirs[@]} -gt 0 ]; then
export EXEC_DIR="${EVAL_DIR}/${valid_dirs[-1]}"
else
echo "The CHECKPOINT_MODEL_ID must be either -2 (new training), -1 (resume training from last epoch checkpoint) or >= 0 (resume trianing from a specific epoch checkpoint)"
fi
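# Resulting layout sketch (the timestamp is an example value):
#   ${EVAL_DIR}/2024-01-01T12:00:00/{checkpoints,valid,test}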
export CACHE_DIR=${STORAGE_DRIVE}/data/core/cache/grapher
export DATA_DIR=${STORAGE_DRIVE}/data/core/webnlg-dataset/release_v3.0/en
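# Assumption: DATA_DIR points at the English part of the WebNLG 3.0 release
# (release_v3.0/en from the webnlg-dataset repository, typically containing the
# train/dev/test splits); adjust the path if your checkout lives elsewhere.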
####################################################
# Server, HPC, and GPU settings
####################################################
# CUDA-index-to-physical-GPU mapping on the troubadix server at TU Wien
# CUDA 0 = GPU 4
# CUDA 1 = GPU 5
# CUDA 2 = GPU 0
# CUDA 3 = GPU 1
# CUDA 4 = GPU 2
# CUDA 5 = GPU 3
export CUDA_VISIBLE_DEVICES=0,1
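# max_split_size_mb caps the size of blocks the PyTorch CUDA caching allocator will
# split (here 1024 MB), which can help avoid fragmentation-related out-of-memory errors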
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:1024
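# Presumably passed to SLURM as the walltime limit; 2-0 means 2 days, 0 hours in
# SLURM's days-hours time format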
export TIME=2-0
# Troubadix: Interactive terminal (implies one node)
# --------------------------------------------------
if [[ $HARDWARE_SETTING == "1_2_ti" ]]; then
export BATCH_SIZE=8
export NUM_NODES=1
export NUM_GPUS_PER_NODE=2
export NUM_DATAWORKERS=15
# GPUs are selected via CUDA_VISIBLE_DEVICES in this file
# VSC5: Interactive terminal (implies one node)
# --------------------------------------------------
elif [[ $HARDWARE_SETTING == "1_2_a40" ]]; then
export BATCH_SIZE=20
export NUM_NODES=1
export NUM_GPUS_PER_NODE=2
export NUM_DATAWORKERS=2
elif [[ $HARDWARE_SETTING == "1_1_a40" ]]; then
export CUDA_VISIBLE_DEVICES=0
export BATCH_SIZE=20
export NUM_NODES=1
export NUM_GPUS_PER_NODE=1
export NUM_DATAWORKERS=2
# GPUs are selected via CUDA_VISIBLE_DEVICES in this file
elif [[ $HARDWARE_SETTING == "1_2_a100" ]]; then
export BATCH_SIZE=16
export NUM_NODES=1
export NUM_GPUS_PER_NODE=2
export NUM_DATAWORKERS=2
# GPUs are selected via CUDA_VISIBLE_DEVICES in this file
# VSC5: SLURM
# --------------------------------------------------
elif [[ $HARDWARE_SETTING == "s2_2_a40" ]]; then
export BATCH_SIZE=20
export NUM_NODES=2
export NUM_GPUS_PER_NODE=2
export PARTITION=zen2_0256_a40x2
export QOS=zen2_0256_a40x2
export NUM_DATAWORKERS=2
# GPUs are selected via CUDA_VISIBLE_DEVICES in this file
elif [[ $HARDWARE_SETTING == "s1_2_a40" ]]; then
export BATCH_SIZE=20
export NUM_NODES=1
export NUM_GPUS_PER_NODE=2
export PARTITION=zen2_0256_a40x2
export QOS=zen2_0256_a40x2
export NUM_DATAWORKERS=2
# GPUs are selected via CUDA_VISIBLE_DEVICES in this file
elif [[ $HARDWARE_SETTING == "s1_1_a40" ]]; then
export CUDA_VISIBLE_DEVICES=0
export BATCH_SIZE=20
export NUM_NODES=1
export NUM_GPUS_PER_NODE=1
export PARTITION=zen2_0256_a40x2
export QOS=zen2_0256_a40x2
export NUM_DATAWORKERS=2
# GPUs are selected via CUDA_VISIBLE_DEVICES in this file
else
echo "Hardware setting not available. "
fi
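####################################################
# Usage sketch (assumed workflow; script names are placeholders, not part of this repository)
####################################################
#   source .env                       # export all variables into the current shell
#   sbatch <your_slurm_script>.sh     # for the SLURM settings (s2_2_a40, s1_2_a40, s1_1_a40)
#   bash <your_training_script>.sh    # for the interactive (non-SLURM) settings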