-
Notifications
You must be signed in to change notification settings - Fork 3
/
generate-divergences.sh
70 lines (57 loc) · 2.93 KB
/
generate-divergences.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env bash
#############################################################################
# #
# #
# Detecting Fine-Grained Cross-Lingual Semantic Divergences #
# without Supervision by Learning To Rank #
# #
# eleftheria #
# #
# ==== Step 3 ==== #
# #
# Mimic synthetic divergences #
# #
# #
#############################################################################
# ==== Check arguments
# $1 and $2 are spliced into the seed file name below as "WikiMatrix.$1-$2..."
# (presumably the two language codes of the WikiMatrix pair -- confirm).
# A missing argument is only reported, not treated as fatal, so that existing
# invocations keep behaving exactly as before.
if [[ $# -lt 2 ]]; then
  echo "warning: expected 2 arguments (used to build the seed file name), got $#" >&2
fi

# Append the Anaconda library directory to the loader search path.
# The ${VAR:+...} form avoids a leading empty entry when LD_LIBRARY_PATH is
# unset or empty; an empty entry makes the dynamic loader search the current
# working directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/fs/clip-scratch/ebriakou/anaconda3/lib"

# ==== Set directories
# All paths are resolved relative to the directory that contains this script,
# so the script can be launched from any working directory.
root_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
data_dir="$root_dir/data"
scripts_dir="$root_dir/source"
seed_file="WikiMatrix.$1-$2.tsv.filtered_sample_50000.moses.seed"
seeds="$data_dir/wikimatrix/$seed_file"

# === Set dependencies
# Point NLTK at the data directory kept under data/ and fetch the small
# English spaCy model.
export NLTK_DATA="$data_dir/nltk_data"
python -m spacy download 'en_core_web_sm'
# ==== Generate synthetic divergences
# Runs generate_divergent_data.py once per mode letter (i u d r g p) on the
# seed file.
echo $'\n> Generate synthetic divergences from seed equivalents'

# Flags shared by every run. They live in an array so that paths stay intact
# when they contain whitespace, and so that an optional flag can be commented
# out without sitting behind a backslash line continuation.
generate_args=(
  --data "$seeds"
  --output synthetic
  --bert_local_cache pretrained_bert
  --pretrained_bert "bert-base-cased"
  #--debug
)

for process in i u d r g p; do
  echo $'\n--- Divergent type: '"$process"$' ---\n'
  python "$scripts_dir/generate_divergent_data.py" \
    --mode "$process" \
    "${generate_args[@]}"
done
# Write the first two tab-separated fields of the seed file to a sibling
# "<seeds>_exclude_align" file; that file is what the training-data builder
# receives as --path_to_unlabeled. Paths are quoted so that a checkout
# location containing whitespace does not split them into several words.
cut -f 1-2 "$seeds" > "${seeds}_exclude_align"

# ==== Sentence-level training data
# Build the divergence-ranking data from the "rdpg" divergence list.
echo $'\n> Prepare divergence ranking for sentence-level divergentmBERT'
python "$scripts_dir/build_bert_training_data.py" \
  --path_to_unlabeled "${seeds}_exclude_align" \
  --path_to_divergences "$root_dir/synthetic/from_$seed_file" \
  --divergent_list rdpg \
  --contrastive \
  --learn-to-rank \
  --divergence-ranking
# ==== Multi-task training data
# Same builder invocation as the sentence-level step, with --multi-task added.
# Paths are quoted so that a checkout location containing whitespace does not
# split them into several words.
echo $'\n> Prepare divergence ranking for multi-task divergentmBERT'
python "$scripts_dir/build_bert_training_data.py" \
  --path_to_unlabeled "${seeds}_exclude_align" \
  --path_to_divergences "$root_dir/synthetic/from_$seed_file" \
  --divergent_list rdpg \
  --contrastive \
  --learn-to-rank \
  --divergence-ranking \
  --multi-task
# ==== Label file
# Write the two labels "O" and "D", one per line, into labels.txt.
# NOTE(review): the target directory is presumably created by the multi-task
# build step -- confirm; it is not created here.
label_file="$root_dir/for_divergentmBERT/from_$seed_file/contrastive_multi_hard/rdpg/labels.txt"
# Single quoted redirect: the file is written in one go, and the path
# survives whitespace instead of failing with "ambiguous redirect".
printf '%s\n' "O" "D" > "$label_file"