From 594143e353fb56dd5fa69200d4f559a36610ff8d Mon Sep 17 00:00:00 2001
From: Rui Melo <44201826+rufimelo99@users.noreply.github.com>
Date: Thu, 14 Dec 2023 15:25:13 +0000
Subject: [PATCH] chore; update make_multilingual.py (#2243)

* chore; update make_multilingual.py

Update paths for parallel dataset

* Update broken paths throughout the examples

* Remove now-unused correct_bias

---------

Co-authored-by: Tom Aarsen
---
 .../evaluation_translation_matching.py        |  2 +-
 examples/training/multilingual/README.md      |  4 +--
 ..._ted2020.py => get_parallel_data_talks.py} | 26 +++++++++----------
 .../multilingual/get_parallel_data_tatoeba.py |  2 +-
 .../multilingual/make_multilingual.py         | 15 +++++------
 .../multilingual/make_multilingual_sys.py     |  4 +--
 6 files changed, 26 insertions(+), 27 deletions(-)
 rename examples/training/multilingual/{get_parallel_data_ted2020.py => get_parallel_data_talks.py} (71%)

diff --git a/examples/evaluation/evaluation_translation_matching.py b/examples/evaluation/evaluation_translation_matching.py
index 3a7576f2d..acbb92635 100644
--- a/examples/evaluation/evaluation_translation_matching.py
+++ b/examples/evaluation/evaluation_translation_matching.py
@@ -17,7 +17,7 @@
 python [model_name_or_path] [parallel-file1] [parallel-file2] ...
 
 For example:
-python distiluse-base-multilingual-cased TED2020-en-de.tsv.gz
+python distiluse-base-multilingual-cased talks-en-de.tsv.gz
 
 See the training_multilingual/get_parallel_data_...py scripts for getting parallel sentence data from different sources
 """
diff --git a/examples/training/multilingual/README.md b/examples/training/multilingual/README.md
index 1b23c2856..ba6fd3f28 100644
--- a/examples/training/multilingual/README.md
+++ b/examples/training/multilingual/README.md
@@ -113,7 +113,7 @@ In our experiments we initialized the student model with the multilingual XLM-Ro
 ## Training
 For a **fully automatic code example**, see [make_multilingual.py](make_multilingual.py).
 
-This scripts downloads the [TED2020 corpus](https://github.com/UKPLab/sentence-transformers/blob/master/docs/datasets/TED2020.md?), a corpus with transcripts and translations from TED and TEDx talks. It than extends a monolingual model to several languages (en, de, es, it, fr, ar, tr). TED2020 contains parallel data for more than 100 languages, hence, you can simple change the script and train a multilingual model in your favorite languages.
+This script downloads the parallel sentences corpus, a corpus with transcripts and translations from talks. It then extends a monolingual model to several languages (en, de, es, it, fr, ar, tr). This corpus contains parallel data for more than 100 languages, hence you can simply change the script and train a multilingual model in your favorite languages.
@@ -158,7 +158,7 @@ A great website for a vast number of parallel (translated) datasets is [OPUS](ht
 The [examples/training/multilingual](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/multilingual/) folder contains some scripts that download parallel training data and bring it into the right format:
 - [get_parallel_data_opus.py](get_parallel_data_opus.py): This script downloads data from the [OPUS](http://opus.nlpl.eu/) website.
 - [get_parallel_data_tatoeba.py](get_parallel_data_tatoeba.py): This script downloads data from the [Tatoeba](https://tatoeba.org/) website, a website for language learners with example sentences for many languages.
-- [get_parallel_data_ted2020.py](get_parallel_data_ted2020.py): This script downloads data the [TED2020 corpus](https://github.com/UKPLab/sentence-transformers/blob/master/docs/datasets/TED2020.md), which contains transcripts and translations of more than 4,000 TED and TEDx talks in 100+ languages.
+- [get_parallel_data_talks.py](get_parallel_data_talks.py): This script downloads the parallel sentences corpus, which contains transcripts and translations of more than 4,000 talks in 100+ languages.
 
 ## Evaluation
diff --git a/examples/training/multilingual/get_parallel_data_ted2020.py b/examples/training/multilingual/get_parallel_data_talks.py
similarity index 71%
rename from examples/training/multilingual/get_parallel_data_ted2020.py
rename to examples/training/multilingual/get_parallel_data_talks.py
index 2bfa603ca..0f9846436 100644
--- a/examples/training/multilingual/get_parallel_data_ted2020.py
+++ b/examples/training/multilingual/get_parallel_data_talks.py
@@ -1,10 +1,10 @@
 """
-This script downloads the TED2020 corpus (https://github.com/UKPLab/sentence-transformers/blob/master/docs/datasets/TED2020.md)
- and create parallel sentences tsv files that can be used to extend existent sentence embedding models to new languages.
+This script downloads the parallel sentences corpus and creates parallel sentences tsv files that can be used to extend
+existing sentence embedding models to new languages.
 
-The TED2020 corpus is a crawl of transcripts from TED and TEDx talks, which are translated to 100+ languages.
+The parallel sentences corpus is a crawl of transcripts from talks, which are translated to 100+ languages.
 
-The TED2020 corpus cannot be downloaded automatically. It is available for research purposes only (CC-BY-NC).
+The parallel sentences corpus is available for research purposes only (CC-BY-NC).
 
 The training procedure can be found in the files make_multilingual.py and make_multilingual_sys.py.
@@ -24,17 +24,17 @@
 
 dev_sentences = 1000 #Number of sentences we want to use for development
 
-download_url = "" #Specify TED2020 URL here
-ted2020_path = "../datasets/ted2020.tsv.gz" #Path of the TED2020.tsv.gz file.
+download_url = "https://sbert.net/datasets/parallel-sentences.tsv.gz" #Specify parallel sentences URL here
+parallel_sentences_path = "../datasets/parallel-sentences.tsv.gz" #Path of the parallel-sentences.tsv.gz file.
 parallel_sentences_folder = "parallel-sentences/"
 
-os.makedirs(os.path.dirname(ted2020_path), exist_ok=True)
-if not os.path.exists(ted2020_path):
-    print("ted2020.tsv.gz does not exists. Try to download from server")
-    sentence_transformers.util.http_get(download_url, ted2020_path)
+os.makedirs(os.path.dirname(parallel_sentences_path), exist_ok=True)
+if not os.path.exists(parallel_sentences_path):
+    print("parallel-sentences.tsv.gz does not exist. Trying to download from server")
+    sentence_transformers.util.http_get(download_url, parallel_sentences_path)
@@ -44,8 +44,8 @@
 files_to_create = []
 for source_lang in source_languages:
     for target_lang in target_languages:
-        output_filename_train = os.path.join(parallel_sentences_folder, "TED2020-{}-{}-train.tsv.gz".format(source_lang, target_lang))
-        output_filename_dev = os.path.join(parallel_sentences_folder, "TED2020-{}-{}-dev.tsv.gz".format(source_lang, target_lang))
+        output_filename_train = os.path.join(parallel_sentences_folder, "talks-{}-{}-train.tsv.gz".format(source_lang, target_lang))
+        output_filename_dev = os.path.join(parallel_sentences_folder, "talks-{}-{}-dev.tsv.gz".format(source_lang, target_lang))
         train_files.append(output_filename_train)
         dev_files.append(output_filename_dev)
         if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
@@ -57,7 +57,7 @@
 if len(files_to_create) > 0:
     print("Parallel sentences files {} do not exist. Create these files now".format(", ".join(map(lambda x: x['src_lang']+"-"+x['trg_lang'], files_to_create))))
-    with gzip.open(ted2020_path, 'rt', encoding='utf8') as fIn:
+    with gzip.open(parallel_sentences_path, 'rt', encoding='utf8') as fIn:
         reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
         for line in tqdm(reader, desc="Sentences"):
             for outfile in files_to_create:
diff --git a/examples/training/multilingual/get_parallel_data_tatoeba.py b/examples/training/multilingual/get_parallel_data_tatoeba.py
index 07f39f233..00116836c 100644
--- a/examples/training/multilingual/get_parallel_data_tatoeba.py
+++ b/examples/training/multilingual/get_parallel_data_tatoeba.py
@@ -10,7 +10,7 @@ import gzip
 
 # Note: Tatoeba uses 3 letter language codes (ISO-639-2),
-# while other datasets like OPUS / TED2020 use 2 letter language codes (ISO-639-1)
+# while other datasets like OPUS use 2 letter language codes (ISO-639-1)
 # For training of sentence transformers, which type of language code is used doesn't matter.
 # For language codes, see: https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes
 source_languages = set(['eng'])
diff --git a/examples/training/multilingual/make_multilingual.py b/examples/training/multilingual/make_multilingual.py
index 684414a95..3c93f6db7 100644
--- a/examples/training/multilingual/make_multilingual.py
+++ b/examples/training/multilingual/make_multilingual.py
@@ -9,9 +9,8 @@
 with the first column a sentence in a language understood by the teacher model, e.g. English,
 and the further columns contain the corresponding translations for languages you want to extend to.
 
-This scripts downloads automatically the TED2020 corpus: https://github.com/UKPLab/sentence-transformers/blob/master/docs/datasets/TED2020.md
-This corpus contains transcripts from
-TED and TEDx talks, translated to 100+ languages. For other parallel data, see get_parallel_data_[].py scripts
+This script automatically downloads the parallel sentences corpus. This corpus contains transcripts from
+talks translated to 100+ languages. For other parallel data, see the get_parallel_data_[].py scripts.
 
 Further information can be found in our paper:
 Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
@@ -78,8 +77,8 @@ def download_corpora(filepaths):
 
 # Here we define the train and dev corpora
-train_corpus = "datasets/ted2020.tsv.gz" # Transcripts of TED talks, crawled 2020
-sts_corpus = "datasets/STS2017-extended.zip" # Extended STS2017 dataset for more languages
+train_corpus = "datasets/parallel-sentences.tsv.gz"
+sts_corpus = "datasets/stsbenchmark.zip"
 parallel_sentences_folder = "parallel-sentences/"
 
 # Check if the files exist. If not, they are downloaded
@@ -93,8 +92,8 @@ def download_corpora(filepaths):
 files_to_create = []
 for source_lang in source_languages:
     for target_lang in target_languages:
-        output_filename_train = os.path.join(parallel_sentences_folder, "TED2020-{}-{}-train.tsv.gz".format(source_lang, target_lang))
-        output_filename_dev = os.path.join(parallel_sentences_folder, "TED2020-{}-{}-dev.tsv.gz".format(source_lang, target_lang))
+        output_filename_train = os.path.join(parallel_sentences_folder, "talks-{}-{}-train.tsv.gz".format(source_lang, target_lang))
+        output_filename_dev = os.path.join(parallel_sentences_folder, "talks-{}-{}-dev.tsv.gz".format(source_lang, target_lang))
         train_files.append(output_filename_train)
         dev_files.append(output_filename_dev)
         if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
@@ -217,5 +216,5 @@ def download_corpora(filepaths):
           evaluation_steps=num_evaluation_steps,
           output_path=output_path,
           save_best_model=True,
-          optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
+          optimizer_params={'lr': 2e-5, 'eps': 1e-6}
           )
diff --git a/examples/training/multilingual/make_multilingual_sys.py b/examples/training/multilingual/make_multilingual_sys.py
index f45fdfdba..443e80492 100644
--- a/examples/training/multilingual/make_multilingual_sys.py
+++ b/examples/training/multilingual/make_multilingual_sys.py
@@ -9,7 +9,7 @@
 with the first column a sentence in a language understood by the teacher model, e.g. English,
 and the further columns contain the corresponding translations for languages you want to extend to.
 
-See get_parallel_data_[opus/tatoeba/ted2020].py for automatic download of parallel sentences datasets.
+See get_parallel_data_[opus/tatoeba/talks].py for automatic download of parallel sentences datasets.
 
 Note: See make_multilingual.py for a fully automated script that downloads the necessary data and trains the model.
 This script just trains the model if you already have parallel data in the right format.
@@ -23,7 +23,7 @@
 python make_multilingual_sys.py train1.tsv.gz train2.tsv.gz train3.tsv.gz --dev dev1.tsv.gz dev2.tsv.gz
 
 For example:
-python make_multilingual_sys.py parallel-sentences/TED2020-en-de-train.tsv.gz --dev parallel-sentences/TED2020-en-de-dev.tsv.gz
+python make_multilingual_sys.py parallel-sentences/talks-en-de-train.tsv.gz --dev parallel-sentences/talks-en-de-dev.tsv.gz
 
 To load all training & dev files from a folder (Linux):
 python make_multilingual_sys.py parallel-sentences/*-train.tsv.gz --dev parallel-sentences/*-dev.tsv.gz
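
For context on how the renamed talks-*.tsv.gz files are consumed: the training these scripts feed is the teacher-student knowledge distillation described in the docstrings above. The sketch below condenses make_multilingual.py's setup into its core sentence-transformers calls. It is an illustration, not the full script: the teacher and student checkpoints, batch size, epoch count, and the single hard-coded training file are stand-in assumptions, while ParallelSentencesDataset, losses.MSELoss, and SentenceTransformer.fit are the APIs the script itself uses.

```python
# Minimal sketch (not part of the patch) of the distillation setup that
# make_multilingual.py builds around the talks-{src}-{trg}-train.tsv.gz files.
# Model names, batch size, and the single hard-coded file below are illustrative.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, losses
from sentence_transformers.datasets import ParallelSentencesDataset

# Teacher: a monolingual (English) model whose embedding space should be preserved.
teacher_model = SentenceTransformer("paraphrase-distilroberta-base-v2")

# Student: a multilingual transformer with mean pooling; it is trained so that
# student(source) and student(translation) both approximate teacher(source).
word_embedding_model = models.Transformer("xlm-roberta-base", max_seq_length=128)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Each row of the tsv.gz holds a source sentence plus its translations; the
# dataset pairs every column with the teacher embedding of the source sentence.
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model)
train_data.load_data("parallel-sentences/talks-en-de-train.tsv.gz")

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=64)
train_loss = losses.MSELoss(model=student_model)  # MSE between student and teacher embeddings

# Same optimizer_params the patch settles on (correct_bias removed).
student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    warmup_steps=10000,
    optimizer_params={"lr": 2e-5, "eps": 1e-6},
)
```

The full script extends this by calling load_data once per talks-{src}-{trg}-train.tsv.gz file produced above and by attaching evaluators on the corresponding dev files, but the training objective is exactly this single MSE loss.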