forked from paracrawl/cirrus-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
06.align.sh
executable file
·72 lines (61 loc) · 2.06 KB
/
06.align.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/bash
## Create the alignment batch lists and submit the alignment jobs on CSD3.
set -euo pipefail
. ./env/init.sh
. ./config.sh
. ./functions.sh
# Print the direct child directories of the given starting points whose
# names are made up of digits only, one path per line.
# Arguments: one or more directories, handed to find as starting points.
# Output order is whatever find produces; callers must not rely on it.
list_numeric_dirs() {
  # Look at immediate children only, never the starting point itself.
  local -a children_only=(-mindepth 1 -maxdepth 1)
  # -regex has to match the whole path, so the final component must be digits.
  local -a numeric_dir=(-type d -regex '.*/[0-9]*')

  find "$@" "${children_only[@]}" "${numeric_dir[@]}"
}
# Build (or reuse) the index of every source/target batch pair for one
# language and print the path of that index file on stdout.
#
# Globals:   COLLECTIONS (read), TARGET_LANG (read),
#            FORCE_INDEX_BATCHES (read; a command name such as true/false,
#            treated as false when unset or empty)
# Arguments: $1 - key into COLLECTIONS
#            $2 - source language
# Outputs:   writes <collection>-batches/06.<lang>-<TARGET_LANG>, one
#            "<source batch>TAB<target batch>" pair per line, and prints
#            that path.
function make_batch_list_all {
  local collection="$1" lang="$2"
  local shards_root="${COLLECTIONS[$collection]}-shards"
  local index_file="${COLLECTIONS[$collection]}-batches/06.${lang}-${TARGET_LANG}"
  local pending_file shard

  if ${FORCE_INDEX_BATCHES:-false} || [[ ! -e "$index_file" ]]; then
    # Write to a scratch file and rename at the end, so an interrupted run
    # does not leave a truncated index that later runs would silently reuse.
    pending_file="${index_file}.tmp.$$"
    while IFS= read -r shard; do
      # The join field (2) is absent from every input line (assuming paths
      # contain no tabs), so all keys compare equal and join emits every
      # source batch paired with every target batch of the same shard.
      join -t$'\t' -j2 -o 1.1,2.1 \
        <(list_numeric_dirs "$shard") \
        <(list_numeric_dirs "${shards_root}/${TARGET_LANG}/$(basename "$shard")")
    done < <(list_numeric_dirs "${shards_root}/${lang}/") > "$pending_file"
    mv -- "$pending_file" "$index_file"
  fi

  printf '%s\n' "$index_file"
}
# Build a timestamped, shuffled list of the batch pairs that still need
# aligning and print the path of that list on stdout.
#
# A pair needs aligning when its alignment file is missing, or when the
# source batch's tokenised_<target language>.gz is newer than it.
#
# Globals:   COLLECTIONS (read), TARGET_LANG (read)
# Arguments: $1 - key into COLLECTIONS
#            $2 - source language
#            (all arguments are forwarded to make_batch_list_all)
# Outputs:   stdout - path of the retry list
#            stderr - one line per alignment file that will be (re)built
function make_batch_list_retry {
  # Take collection/lang from the arguments, matching make_batch_list_all,
  # instead of depending on the caller's globals of the same name.
  local collection="$1" lang="$2"
  local batch_list all_batches alignments SRC_BATCH REF_BATCH

  batch_list="${COLLECTIONS[$collection]}-batches/06.${lang}-${TARGET_LANG}.$(date '+%Y%m%d%H%M%S')"

  # Fail here rather than reading an unnamed file (or stdin) further down.
  all_batches=$(make_batch_list_all "$@") || return
  [[ -n "$all_batches" ]] || return 1

  while IFS=$'\t' read -r SRC_BATCH REF_BATCH; do
    alignments="${SRC_BATCH}/aligned-$(basename "$REF_BATCH").gz"
    # Retry if the alignment file does not exist, or if the tokenised file
    # is newer than the existing alignment file.
    if [[ ! -e "$alignments" || "${SRC_BATCH}/tokenised_${TARGET_LANG%~*}.gz" -nt "$alignments" ]]; then
      printf '%s\n' "$alignments" >&2
      printf '%s\t%s\n' "$SRC_BATCH" "$REF_BATCH"
    fi
  done < "$all_batches" | shuf > "$batch_list"

  printf '%s\n' "$batch_list"
}
# Options handed to `schedule` for every alignment job: a 12 hour time
# limit, 4 CPUs per task, and per-job stderr/stdout files under SLURM_LOGS.
# The log paths are quoted so a SLURM_LOGS value containing whitespace stays
# a single array element.
declare -a OPTIONS=(
  --time 12:00:00
  --cpus-per-task 4
  -e "${SLURM_LOGS}/06.align-%A_%a.err"
  -o "${SLURM_LOGS}/06.align-%A_%a.out"
)

# Quick hack: this should be a proper --option, but functions.sh does not
# allow for that at the moment. Someday...
# Setting OOM_PROOF to any non-empty value requests 12G of memory per CPU
# and exports BLEUALIGN_THREADS=4.
if [[ -n "${OOM_PROOF:-}" ]]; then
  OPTIONS+=(--mem-per-cpu 12G)
  export BLEUALIGN_THREADS=4
fi
# Command line: <collection> <lang>...
# The first argument selects the collection; each remaining argument is a
# language for which alignment jobs may be scheduled.
collection=$1
shift

for lang in "$@"; do
  # make_batch_list / make_job_list are not defined in this file; their
  # output is captured and passed on as single, quoted words.
  batch_list=$(make_batch_list "$collection" "$lang")
  job_list=$(make_job_list "$batch_list")

  # An empty job list means there is nothing to schedule for this language.
  if [[ -n "$job_list" ]]; then
    prompt "Scheduling $job_list\n"
    if confirm; then
      # ${lang%~*} drops a trailing "~suffix" from the language argument.
      schedule \
        -J "align-${lang%~*}-${collection}" \
        -a "$job_list" \
        "${OPTIONS[@]}" \
        "${SCRIPTS}/generic.slurm" "$batch_list" \
        "${SCRIPTS}/06.align" "${lang%~*}"
    fi
  fi
done