train-0.7b.sh
#!/bin/bash
set -x
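# NCCL InfiniBand networking settings (exported again below so the launched training process inherits them)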
NCCL_ENV="NCCL_IB_DISABLE=0 NCCL_IB_GID_INDEX=3 NCCL_NET_GDR_LEVEL=0"
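# Model, parallelism, and training hyperparameters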
NNODES=1
MP=1
g_bs=64
LAYERNUM=6
HIDDENSIZE=384
HEAD=6
MAX_TOKENS=16384
MAX_ALIGNS=1024
MAX_LENGTH=1024
POS_EMBED=1024
BATCHSIZE=1
DATE=dev
WS=16000
ITER=250000
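# Run name encodes the main hyperparameters; reused for the checkpoint, TensorBoard, and log paths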
NAME=${HIDDENSIZE}h-${LAYERNUM}l-${HEAD}hd-${BATCHSIZE}mbs-${g_bs}gbs-${MP}mp-${MAX_TOKENS}tokens-${MAX_ALIGNS}aligns-${MAX_LENGTH}length-${WS}ws-${ITER}iter-${DATE}
MYPATH=$PWD
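# Distributed launch settings (single node, 8 GPUs)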
GPUS_PER_NODE=8
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
MASTER_ADDR=localhost
MASTER_PORT=7100
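# Preprocessed dataset prefix (Megatron-style *_text_document) and output locations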
DATA_PATH="/dataset/ee84df8b/workspace/DATA/TOTAL/TOTAL_text_document"
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank 0 --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
CHECKPOINT_PATH=/workspace/ckpt/$DATE/$NAME
TB=$MYPATH/tb/$DATE/$NAME
LOG=$MYPATH/logs/$DATE/$NAME
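# Create checkpoint, TensorBoard, and log directories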
mkdir -p $CHECKPOINT_PATH
mkdir -p $MYPATH/tb/$DATE/
mkdir -p $TB
mkdir -p $MYPATH/logs/$DATE/
# Export so the launched training process actually sees these NCCL settings
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=0
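# Checkpoint save / evaluation interval (in iterations)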
SE_ITER=50
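# Launch MSA pretraining via torch.distributed.launch; stdout and stderr are appended to the run log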
(python -m torch.distributed.launch $DISTRIBUTED_ARGS \
$MYPATH/pretrain_msa.py \
--num-layers $LAYERNUM \
--hidden-size $HIDDENSIZE \
--num-attention-heads $HEAD \
--micro-batch-size $BATCHSIZE \
--global-batch-size ${g_bs} \
--seq-length $MAX_LENGTH \
--max-position-embeddings $POS_EMBED \
--max-msa-position-embeddings $MAX_ALIGNS \
--train-iters $ITER \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $MYPATH/msa_tools/msa_vocab.txt \
--data-impl mmap \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--clip-grad 1.0 \
--lr-warmup-iters $WS \
--log-interval 1 \
--save-interval $SE_ITER \
--eval-interval $SE_ITER \
--eval-iters 100 \
--max-tokens $MAX_TOKENS \
--max-aligns $MAX_ALIGNS \
--max-length $MAX_LENGTH \
--tensor-model-parallel-size $MP \
--fp16 \
--no-scaled-masked-softmax-fusion \
--tensorboard-dir $TB \
--checkpoint-activations \
--add-msa-positional-embedding \
--add-post-embedding-layernorm \
--split 9996,3,1 \
--override-lr-scheduler \
) |& tee -a $LOG