From ea7ea512e1b14c5842863a7438a445625f823f9f Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 12:54:48 +0100
Subject: [PATCH 01/12] Simplify tests, cheaper to run

---
 tests/test_cross_encoder.py   | 11 +++++------
 tests/test_pretrained_stsb.py | 12 ++++--------
 tests/test_train_stsb.py      | 12 +++++-------
 3 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/tests/test_cross_encoder.py b/tests/test_cross_encoder.py
index f21853954..bd9b58e18 100644
--- a/tests/test_cross_encoder.py
+++ b/tests/test_cross_encoder.py
@@ -21,8 +21,9 @@ def setUp(self):
             util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
 
         #Read STSB
+        max_test_samples = 100
+        max_train_samples = 500
         self.stsb_train_samples = []
-        self.dev_samples = []
         self.test_samples = []
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
@@ -30,11 +31,9 @@ def setUp(self):
                 score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
 
-                if row['split'] == 'dev':
-                    self.dev_samples.append(inp_example)
-                elif row['split'] == 'test':
+                if row['split'] == 'test' and len(self.test_samples) < max_test_samples:
                     self.test_samples.append(inp_example)
-                else:
+                elif row['split'] == 'train' and len(self.stsb_train_samples) < max_train_samples:
                     self.stsb_train_samples.append(inp_example)
 
     def evaluate_stsb_test(self, model, expected_score):
@@ -53,7 +52,7 @@ def test_train_stsb(self):
         model.fit(train_dataloader=train_dataloader,
                   epochs=1,
                   warmup_steps=int(len(train_dataloader)*0.1))
-        self.evaluate_stsb_test(model, 75)
+        self.evaluate_stsb_test(model, 50)
 
diff --git a/tests/test_pretrained_stsb.py b/tests/test_pretrained_stsb.py
index 95974c12b..2cbdecf0e 100644
--- a/tests/test_pretrained_stsb.py
+++ b/tests/test_pretrained_stsb.py
@@ -10,15 +10,13 @@
 
 class PretrainedSTSbTest(unittest.TestCase):
 
-    def pretrained_model_score(self, model_name, expected_score):
+    def pretrained_model_score(self, model_name, expected_score, max_test_samples: int = 100):
         model = SentenceTransformer(model_name)
         sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
 
         if not os.path.exists(sts_dataset_path):
             util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
 
-        train_samples = []
-        dev_samples = []
         test_samples = []
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
             for row in reader:
                 score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
 
-                if row['split'] == 'dev':
-                    dev_samples.append(inp_example)
-                elif row['split'] == 'test':
+                if row['split'] == 'test':
                     test_samples.append(inp_example)
-                else:
-                    train_samples.append(inp_example)
+                    if len(test_samples) >= max_test_samples:
+                        break
 
         evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
 
diff --git a/tests/test_train_stsb.py b/tests/test_train_stsb.py
index 7c7195847..a25d0670a 100644
--- a/tests/test_train_stsb.py
+++ b/tests/test_train_stsb.py
@@ -26,7 +26,7 @@ def setUp(self):
         #Read NLI
         label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
         self.nli_train_samples = []
-        max_train_samples = 10000
+        max_train_samples = 100
         with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
             for row in reader:
@@ -38,19 +38,17 @@ def setUp(self):
 
         #Read STSB
         self.stsb_train_samples = []
-        self.dev_samples = []
         self.test_samples = []
+        max_train_samples = 100
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
             for row in reader:
                 score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
 
-                if row['split'] == 'dev':
-                    self.dev_samples.append(inp_example)
-                elif row['split'] == 'test':
+                if row['split'] == 'test':
                     self.test_samples.append(inp_example)
-                else:
+                elif row['split'] == 'train' and len(self.stsb_train_samples) < max_train_samples:
                     self.stsb_train_samples.append(inp_example)
 
     def evaluate_stsb_test(self, model, expected_score):
@@ -73,7 +71,7 @@ def test_train_stsb(self):
                   warmup_steps=int(len(train_dataloader)*0.1),
                   use_amp=True)
 
-        self.evaluate_stsb_test(model, 80.0)
+        self.evaluate_stsb_test(model, 65.0)
 
     def test_train_nli(self):
         word_embedding_model = models.Transformer('distilbert-base-uncased')
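
Note on the pattern above: the savings in PATCH 01 come from capping how many STSB rows are turned into InputExamples. As a standalone sketch (assuming datasets/stsbenchmark.tsv.gz is already downloaded; the caps mirror the patch, and the early break is an extra convenience that is not in the patch):

    import csv
    import gzip

    from sentence_transformers.readers import InputExample

    max_test_samples, max_train_samples = 100, 500  # caps used by the patched test
    train_samples, test_samples = [], []

    with gzip.open('datasets/stsbenchmark.tsv.gz', 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(row['score']) / 5.0  # normalize gold scores from 0 ... 5 to 0 ... 1
            example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
            if row['split'] == 'test' and len(test_samples) < max_test_samples:
                test_samples.append(example)
            elif row['split'] == 'train' and len(train_samples) < max_train_samples:
                train_samples.append(example)
            # Extra convenience over the patch: stop reading once both caps are hit
            if len(test_samples) >= max_test_samples and len(train_samples) >= max_train_samples:
                break
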
From bfdcf202f4d94023ce45e3dfe9a90fabf6ad0145 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 12:55:14 +0100
Subject: [PATCH 02/12] Add simple CI

---
 .github/workflows/tests.yml | 73 +++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 .github/workflows/tests.yml

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 000000000..c7c625f38
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,73 @@
+name: Unit tests
+
+on:
+  push:
+    branches:
+      - master
+      - v*-release
+  pull_request:
+    branches:
+      - master
+  workflow_dispatch:
+
+jobs:
+
+  test_sampling:
+    name: Run unit tests
+    strategy:
+      matrix:
+        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
+        os: [ubuntu-latest, windows-latest]
+      fail-fast: false
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Setup Python environment
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Try to load cached dependencies
+        uses: actions/cache@v3
+        id: restore-cache
+        with:
+          path: ${{ env.pythonLocation }}
+          key: python-dependencies-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ env.pythonLocation }}
+
+      - name: Install external dependencies on cache miss
+        run: |
+          python -m pip install --no-cache-dir --upgrade pip
+          python -m pip install --no-cache-dir .
+        if: steps.restore-cache.outputs.cache-hit != 'true'
+
+      - name: Install the checked-out setfit
+        run: python -m pip install .
+
+      - name: Restore HF models from cache
+        uses: actions/cache/restore@v3
+        with:
+          path: |
+            ~/.cache/huggingface/hub
+            ~/.cache/torch
+          key: hf-models-${{ matrix.os }}-${{ env.NEW_HF_CACHE_HASH }}
+          restore-keys: |
+            hf-models-${{ matrix.os }}-
+
+      - name: Run unit tests
+        shell: bash
+        run: |
+          echo "OLD_HF_CACHE_HASH=$(find ~/.cache/huggingface/hub ~/.cache/torch -type f -exec sha256sum {} + | LC_ALL=C sort | sha256sum | cut -d ' ' -f 1)" >> $GITHUB_ENV
+          pytest -sv tests/
+          echo "NEW_HF_CACHE_HASH=$(find ~/.cache/huggingface/hub ~/.cache/torch -type f -exec sha256sum {} + | LC_ALL=C sort | sha256sum | cut -d ' ' -f 1)" >> $GITHUB_ENV
+
+      - name: Save new HF models to cache
+        uses: actions/cache/save@v3
+        with:
+          path: |
+            ~/.cache/huggingface/hub
+            ~/.cache/torch
+          key: hf-models-${{ matrix.os }}-${{ env.NEW_HF_CACHE_HASH }}
+        # Only save cache if the hash has changed
+        if: env.NEW_HF_CACHE_HASH != env.OLD_HF_CACHE_HASH
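
The last two steps of this workflow implement a fingerprint-and-save scheme: the Hugging Face model cache is hashed before and after the test run, and a new cache entry is saved only when the hash changed. A rough Python equivalent of the shell fingerprint above (a sketch for illustration only; hashing large model files this way is slow):

    import hashlib
    from pathlib import Path

    def cache_fingerprint(*roots: str) -> str:
        # Same idea as the shell pipeline in the workflow:
        # find ... -exec sha256sum {} + | LC_ALL=C sort | sha256sum
        digest = hashlib.sha256()
        for root in roots:
            for path in sorted(Path(root).expanduser().rglob('*')):
                if path.is_file():
                    digest.update(str(path).encode('utf-8'))  # file name feeds the hash
                    digest.update(path.read_bytes())          # file contents feed the hash
        return digest.hexdigest()

    before = cache_fingerprint('~/.cache/huggingface/hub', '~/.cache/torch')
    # ... the test run happens here and may download new models ...
    after = cache_fingerprint('~/.cache/huggingface/hub', '~/.cache/torch')
    if after != before:
        print('model cache changed; only then is a new cache entry saved')
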
From c1473cf2f4dd206da7b39c37ac5a50a522036ee4 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 13:00:18 +0100
Subject: [PATCH 03/12] Also install pytest

---
 .github/workflows/tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c7c625f38..e6ad3459a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -40,6 +40,7 @@ jobs:
         run: |
           python -m pip install --no-cache-dir --upgrade pip
           python -m pip install --no-cache-dir .
+          python -m pip install --no-cache-dir pytest
         if: steps.restore-cache.outputs.cache-hit != 'true'
 
       - name: Install the checked-out setfit

From 5a28e81d4ef25780358b0eeb87727f351e8d73ea Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 13:01:13 +0100
Subject: [PATCH 04/12] Remove Python 3.12, not supported with torch yet

---
 .github/workflows/tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index e6ad3459a..a3b7bb133 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
     name: Run unit tests
     strategy:
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11']
         os: [ubuntu-latest, windows-latest]
       fail-fast: false
     runs-on: ${{ matrix.os }}
@@ -43,7 +43,7 @@ jobs:
           python -m pip install --no-cache-dir pytest
         if: steps.restore-cache.outputs.cache-hit != 'true'
 
-      - name: Install the checked-out setfit
+      - name: Install the checked-out sentence-transformers
         run: python -m pip install .
 
       - name: Restore HF models from cache
From ef6db321b4e58f94002020fa124ec89c2e3f0a89 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 13:18:03 +0100
Subject: [PATCH 05/12] No CI for Python 3.6 - too deprecated

I'll fully deprecate Python 3.7 soon too

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index a3b7bb133..39b13eeb5 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
     name: Run unit tests
     strategy:
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
         os: [ubuntu-latest, windows-latest]
       fail-fast: false
     runs-on: ${{ matrix.os }}

From 8293b5ca84030f17af1a738758694e27c6ce9cb2 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 13:30:08 +0100
Subject: [PATCH 06/12] Apply patch to paraphrase_mining_embeddings

Ensure that i < j always holds true

---
 sentence_transformers/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index 6361ec9a5..afacc3a5f 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -189,7 +189,7 @@ def paraphrase_mining_embeddings(embeddings: Tensor,
 
                 if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
                     added_pairs.add((sorted_i, sorted_j))
-                    pairs_list.append([score, i, j])
+                    pairs_list.append([score, sorted_i, sorted_j])
 
     # Highest scores first
     pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
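
The fix in PATCH 06 matters because deduplication keys each pair by its sorted indices, while the appended entry previously kept the original orientation. A minimal sketch of the restored invariant (the candidate list is illustrative):

    # Candidate pairs can arrive in either orientation; both the dedup key and
    # the stored entry must use the sorted indices (sorted_i < sorted_j).
    candidates = [(0.9, 3, 1), (0.9, 1, 3), (0.8, 2, 5)]  # illustrative (score, i, j) triples

    added_pairs = set()
    pairs_list = []
    for score, i, j in candidates:
        sorted_i, sorted_j = min(i, j), max(i, j)
        if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
            added_pairs.add((sorted_i, sorted_j))
            # Before the fix this appended [score, i, j], so the same pair could
            # surface as (3, 1) even though it was deduplicated under (1, 3).
            pairs_list.append([score, sorted_i, sorted_j])

    assert all(i < j for _, i, j in pairs_list)
    print(pairs_list)  # [[0.9, 1, 3], [0.8, 2, 5]]
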
From d6f8c343f3b4c8bccc2864c744cec2ec520d6709 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 13:58:36 +0100
Subject: [PATCH 07/12] Stop trying to cache loaded Models

They're simply too big when combined.

---
 .github/workflows/tests.yml | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 39b13eeb5..f9ae650bd 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -46,29 +46,7 @@ jobs:
       - name: Install the checked-out sentence-transformers
         run: python -m pip install .
 
-      - name: Restore HF models from cache
-        uses: actions/cache/restore@v3
-        with:
-          path: |
-            ~/.cache/huggingface/hub
-            ~/.cache/torch
-          key: hf-models-${{ matrix.os }}-${{ env.NEW_HF_CACHE_HASH }}
-          restore-keys: |
-            hf-models-${{ matrix.os }}-
-
       - name: Run unit tests
         shell: bash
         run: |
-          echo "OLD_HF_CACHE_HASH=$(find ~/.cache/huggingface/hub ~/.cache/torch -type f -exec sha256sum {} + | LC_ALL=C sort | sha256sum | cut -d ' ' -f 1)" >> $GITHUB_ENV
           pytest -sv tests/
-          echo "NEW_HF_CACHE_HASH=$(find ~/.cache/huggingface/hub ~/.cache/torch -type f -exec sha256sum {} + | LC_ALL=C sort | sha256sum | cut -d ' ' -f 1)" >> $GITHUB_ENV
-
-      - name: Save new HF models to cache
-        uses: actions/cache/save@v3
-        with:
-          path: |
-            ~/.cache/huggingface/hub
-            ~/.cache/torch
-          key: hf-models-${{ matrix.os }}-${{ env.NEW_HF_CACHE_HASH }}
-        # Only save cache if the hash has changed
-        if: env.NEW_HF_CACHE_HASH != env.OLD_HF_CACHE_HASH

From 2db0b9f77d063ca64d9b5494449807cf79a3c711 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 14:34:21 +0100
Subject: [PATCH 08/12] Increase lower bound of pretrained scores

But now based on 100 test samples instead

---
 tests/test_pretrained_stsb.py | 46 +++++++++++++++++------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/tests/test_pretrained_stsb.py b/tests/test_pretrained_stsb.py
index 2cbdecf0e..db804ac85 100644
--- a/tests/test_pretrained_stsb.py
+++ b/tests/test_pretrained_stsb.py
@@ -36,46 +36,46 @@ def pretrained_model_score(self, model_name, expected_score, max_test_samples: i
         assert score > expected_score or abs(score-expected_score) < 0.1
 
     def test_bert_base(self):
-        self.pretrained_model_score('bert-base-nli-mean-tokens', 77.12)
-        self.pretrained_model_score('bert-base-nli-max-tokens', 77.21)
-        self.pretrained_model_score('bert-base-nli-cls-token', 76.30)
-        self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 85.14)
+        self.pretrained_model_score('bert-base-nli-mean-tokens', 86.53)
+        self.pretrained_model_score('bert-base-nli-max-tokens', 87.00)
+        self.pretrained_model_score('bert-base-nli-cls-token', 85.93)
+        self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 89.26)
 
     def test_bert_large(self):
-        self.pretrained_model_score('bert-large-nli-mean-tokens', 79.19)
-        self.pretrained_model_score('bert-large-nli-max-tokens', 78.41)
-        self.pretrained_model_score('bert-large-nli-cls-token', 78.29)
-        self.pretrained_model_score('bert-large-nli-stsb-mean-tokens', 85.29)
+        self.pretrained_model_score('bert-large-nli-mean-tokens', 90.06)
+        self.pretrained_model_score('bert-large-nli-max-tokens', 90.15)
+        self.pretrained_model_score('bert-large-nli-cls-token', 89.51)
+        self.pretrained_model_score('bert-large-nli-stsb-mean-tokens', 92.27)
 
     def test_roberta(self):
-        self.pretrained_model_score('roberta-base-nli-mean-tokens', 77.49)
-        self.pretrained_model_score('roberta-large-nli-mean-tokens', 78.69)
-        self.pretrained_model_score('roberta-base-nli-stsb-mean-tokens', 85.30)
-        self.pretrained_model_score('roberta-large-nli-stsb-mean-tokens', 86.39)
+        self.pretrained_model_score('roberta-base-nli-mean-tokens', 87.91)
+        self.pretrained_model_score('roberta-large-nli-mean-tokens', 89.41)
+        self.pretrained_model_score('roberta-base-nli-stsb-mean-tokens', 93.39)
+        self.pretrained_model_score('roberta-large-nli-stsb-mean-tokens', 91.26)
 
     def test_distilbert(self):
-        self.pretrained_model_score('distilbert-base-nli-mean-tokens', 78.69)
-        self.pretrained_model_score('distilbert-base-nli-stsb-mean-tokens', 85.16)
-        self.pretrained_model_score('paraphrase-distilroberta-base-v1', 81.81)
+        self.pretrained_model_score('distilbert-base-nli-mean-tokens', 88.83)
+        self.pretrained_model_score('distilbert-base-nli-stsb-mean-tokens', 91.01)
+        self.pretrained_model_score('paraphrase-distilroberta-base-v1', 90.89)
 
     def test_multiling(self):
-        self.pretrained_model_score('distiluse-base-multilingual-cased', 80.75)
-        self.pretrained_model_score('paraphrase-xlm-r-multilingual-v1', 83.50)
-        self.pretrained_model_score('paraphrase-multilingual-MiniLM-L12-v2', 84.42)
+        self.pretrained_model_score('distiluse-base-multilingual-cased', 88.79)
+        self.pretrained_model_score('paraphrase-xlm-r-multilingual-v1', 92.76)
+        self.pretrained_model_score('paraphrase-multilingual-MiniLM-L12-v2', 92.64)
 
     def test_mpnet(self):
-        self.pretrained_model_score('paraphrase-mpnet-base-v2', 86.99)
+        self.pretrained_model_score('paraphrase-mpnet-base-v2', 92.83)
 
     def test_other_models(self):
-        self.pretrained_model_score('average_word_embeddings_komninos', 61.56)
+        self.pretrained_model_score('average_word_embeddings_komninos', 68.97)
 
     def test_msmarco(self):
-        self.pretrained_model_score('msmarco-roberta-base-ance-firstp', 77.0)
-        self.pretrained_model_score('msmarco-distilbert-base-v3', 78.85)
+        self.pretrained_model_score('msmarco-roberta-base-ance-firstp', 83.61)
+        self.pretrained_model_score('msmarco-distilbert-base-v3', 87.96)
 
     def test_sentence_t5(self):
-        self.pretrained_model_score('sentence-t5-base', 85.52)
+        self.pretrained_model_score('sentence-t5-base', 92.75)
 
 if "__main__" == __name__:
     unittest.main()
\ No newline at end of file
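
The expected values above act as regression floors: the test asserts score > expected_score or abs(score-expected_score) < 0.1, so scores may drift upward freely but not downward. A hedged sketch of how such a reference value could be regenerated on the capped test set (the model name is illustrative; test_samples as loaded in the sketch after PATCH 01):

    from sentence_transformers import SentenceTransformer
    from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

    model = SentenceTransformer('paraphrase-mpnet-base-v2')  # illustrative model name
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples[:100], name='sts-test')
    score = model.evaluate(evaluator) * 100
    print("expected_score candidate: {:.2f}".format(score))  # e.g. 92.83 in the table above
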
From 44a23f59e9a58adedcf799fe37bf738aca6204f8 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 14:41:20 +0100
Subject: [PATCH 09/12] Prevent duplicate ST install

---
 .github/workflows/tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f9ae650bd..4b78de05b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -39,7 +39,6 @@ jobs:
       - name: Install external dependencies on cache miss
         run: |
           python -m pip install --no-cache-dir --upgrade pip
-          python -m pip install --no-cache-dir .
           python -m pip install --no-cache-dir pytest
         if: steps.restore-cache.outputs.cache-hit != 'true'
From b49c772d458a9c7d76293edb27555095bb425607 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 14:42:38 +0100
Subject: [PATCH 10/12] Install requirements.txt within the cached section

---
 .github/workflows/tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4b78de05b..548051825 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -39,6 +39,7 @@ jobs:
       - name: Install external dependencies on cache miss
         run: |
           python -m pip install --no-cache-dir --upgrade pip
+          python -m pip install --no-cache-dir -r requirements.txt
           python -m pip install --no-cache-dir pytest
         if: steps.restore-cache.outputs.cache-hit != 'true'

From 41ad475518b9a1d5e695dfd13fcdc5f059a4e9ce Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 7 Nov 2023 14:56:03 +0100
Subject: [PATCH 11/12] Relax lower bound on training test

These kinds of tests can be a bit flaky; this should help.

---
 tests/test_train_stsb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_train_stsb.py b/tests/test_train_stsb.py
index a25d0670a..952808ecc 100644
--- a/tests/test_train_stsb.py
+++ b/tests/test_train_stsb.py
@@ -71,7 +71,7 @@ def test_train_stsb(self):
                   warmup_steps=int(len(train_dataloader)*0.1),
                   use_amp=True)
 
-        self.evaluate_stsb_test(model, 65.0)
+        self.evaluate_stsb_test(model, 60.0)
 
     def test_train_nli(self):
         word_embedding_model = models.Transformer('distilbert-base-uncased')
From b6553e82a4431c0e2255b0901ffada0e3cf24021 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Thu, 7 Dec 2023 14:05:42 +0100
Subject: [PATCH 12/12] Reintroduce slow tests, can be run with 'pytest -m
 slow'

---
 pytest.ini                    |  6 +++
 tests/test_cross_encoder.py   | 31 ++++++------
 tests/test_pretrained_stsb.py | 94 +++++++++++++++++++++++++----------
 tests/test_train_stsb.py      | 44 +++++++++++++---
 4 files changed, 127 insertions(+), 48 deletions(-)
 create mode 100644 pytest.ini

diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 000000000..5b46e89a2
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+testpaths =
+    tests
+addopts = --strict-markers -m "not slow"
+markers =
+    slow: marks tests as slow
\ No newline at end of file
diff --git a/tests/test_cross_encoder.py b/tests/test_cross_encoder.py
index bd9b58e18..f2e5ba510 100644
--- a/tests/test_cross_encoder.py
+++ b/tests/test_cross_encoder.py
@@ -5,10 +5,10 @@
 import gzip
 import os
 import unittest
+import pytest
 
 from torch.utils.data import DataLoader
-import logging
 
-from sentence_transformers import CrossEncoder, util, LoggingHandler
+from sentence_transformers import CrossEncoder, util
 from sentence_transformers.readers import InputExample
 from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
@@ -21,8 +21,6 @@ def setUp(self):
             util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
 
         #Read STSB
-        max_test_samples = 100
-        max_train_samples = 500
         self.stsb_train_samples = []
         self.test_samples = []
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
             for row in reader:
                 score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
 
-                if row['split'] == 'test' and len(self.test_samples) < max_test_samples:
+                if row['split'] == 'test':
                     self.test_samples.append(inp_example)
-                elif row['split'] == 'train' and len(self.stsb_train_samples) < max_train_samples:
+                elif row['split'] == 'train':
                     self.stsb_train_samples.append(inp_example)
 
-    def evaluate_stsb_test(self, model, expected_score):
-        evaluator = CECorrelationEvaluator.from_input_examples(self.test_samples, name='sts-test')
+    def evaluate_stsb_test(self, model, expected_score, num_test_samples: int = -1):
+        evaluator = CECorrelationEvaluator.from_input_examples(self.test_samples[:num_test_samples], name='sts-test')
         score = evaluator(model)*100
         print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(score, expected_score))
         assert score > expected_score or abs(score-expected_score) < 0.1
 
     def test_pretrained_stsb(self):
         model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
         self.evaluate_stsb_test(model, 87.92)
 
-    def test_train_stsb(self):
+    @pytest.mark.slow
+    def test_train_stsb_slow(self):
         model = CrossEncoder('distilroberta-base', num_labels=1)
         train_dataloader = DataLoader(self.stsb_train_samples, shuffle=True, batch_size=16)
         model.fit(train_dataloader=train_dataloader,
                   epochs=1,
                   warmup_steps=int(len(train_dataloader)*0.1))
-        self.evaluate_stsb_test(model, 50)
-
-
+        self.evaluate_stsb_test(model, 75)
 
-if "__main__" == __name__:
-    unittest.main()
\ No newline at end of file
+    def test_train_stsb(self):
+        model = CrossEncoder('distilroberta-base', num_labels=1)
+        train_dataloader = DataLoader(self.stsb_train_samples[:500], shuffle=True, batch_size=16)
+        model.fit(train_dataloader=train_dataloader,
+                  epochs=1,
+                  warmup_steps=int(len(train_dataloader)*0.1))
+        self.evaluate_stsb_test(model, 50, num_test_samples=100)
diff --git a/tests/test_pretrained_stsb.py b/tests/test_pretrained_stsb.py
index db804ac85..0bd210871 100644
--- a/tests/test_pretrained_stsb.py
+++ b/tests/test_pretrained_stsb.py
@@ -1,39 +1,87 @@
 """
 Tests that the pretrained models produce the correct scores on the STSbenchmark dataset
 """
+from functools import partial
 from sentence_transformers import SentenceTransformer, InputExample, util
 from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
-import unittest
 import os
 import gzip
 import csv
+import pytest
 
 
-class PretrainedSTSbTest(unittest.TestCase):
+def pretrained_model_score(model_name, expected_score, max_test_samples: int = 100):
+    model = SentenceTransformer(model_name)
+    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
 
-    def pretrained_model_score(self, model_name, expected_score, max_test_samples: int = 100):
-        model = SentenceTransformer(model_name)
-        sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
+    if not os.path.exists(sts_dataset_path):
+        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
 
-        if not os.path.exists(sts_dataset_path):
-            util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
+    test_samples = []
+    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
+        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
+        for row in reader:
+            score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
+            inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
 
-        test_samples = []
-        with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
-            reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
-            for row in reader:
-                score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
-                inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
+            if row['split'] == 'test':
+                test_samples.append(inp_example)
+                if max_test_samples != -1 and len(test_samples) >= max_test_samples:
+                    break
 
-                if row['split'] == 'test':
-                    test_samples.append(inp_example)
-                    if len(test_samples) >= max_test_samples:
-                        break
+    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
 
-        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
+    score = model.evaluate(evaluator)*100
+    print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
+    assert score > expected_score or abs(score-expected_score) < 0.1
 
-        score = model.evaluate(evaluator)*100
-        print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
-        assert score > expected_score or abs(score-expected_score) < 0.1
 
+@pytest.mark.slow
+class TestPretrainedSTSbSlow:
+    pretrained_model_score = partial(pretrained_model_score, max_test_samples=-1)
+
+    def test_bert_base(self):
+        self.pretrained_model_score('bert-base-nli-mean-tokens', 77.12)
+        self.pretrained_model_score('bert-base-nli-max-tokens', 77.21)
+        self.pretrained_model_score('bert-base-nli-cls-token', 76.30)
+        self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 85.14)
+
+    def test_bert_large(self):
+        self.pretrained_model_score('bert-large-nli-mean-tokens', 79.19)
+        self.pretrained_model_score('bert-large-nli-max-tokens', 78.41)
+        self.pretrained_model_score('bert-large-nli-cls-token', 78.29)
+        self.pretrained_model_score('bert-large-nli-stsb-mean-tokens', 85.29)
+
+    def test_roberta(self):
+        self.pretrained_model_score('roberta-base-nli-mean-tokens', 77.49)
+        self.pretrained_model_score('roberta-large-nli-mean-tokens', 78.69)
+        self.pretrained_model_score('roberta-base-nli-stsb-mean-tokens', 85.30)
+        self.pretrained_model_score('roberta-large-nli-stsb-mean-tokens', 86.39)
+
+    def test_distilbert(self):
+        self.pretrained_model_score('distilbert-base-nli-mean-tokens', 78.69)
+        self.pretrained_model_score('distilbert-base-nli-stsb-mean-tokens', 85.16)
+        self.pretrained_model_score('paraphrase-distilroberta-base-v1', 81.81)
+
+    def test_multiling(self):
+        self.pretrained_model_score('distiluse-base-multilingual-cased', 80.75)
+        self.pretrained_model_score('paraphrase-xlm-r-multilingual-v1', 83.50)
+        self.pretrained_model_score('paraphrase-multilingual-MiniLM-L12-v2', 84.42)
+
+    def test_mpnet(self):
+        self.pretrained_model_score('paraphrase-mpnet-base-v2', 86.99)
+
+    def test_other_models(self):
+        self.pretrained_model_score('average_word_embeddings_komninos', 61.56)
+
+    def test_msmarco(self):
+        self.pretrained_model_score('msmarco-roberta-base-ance-firstp', 77.0)
+        self.pretrained_model_score('msmarco-distilbert-base-v3', 78.85)
+
+    def test_sentence_t5(self):
+        self.pretrained_model_score('sentence-t5-base', 85.52)
+
+
+class TestPretrainedSTSbFast:
+    pretrained_model_score = partial(pretrained_model_score, max_test_samples=100)
 
     def test_bert_base(self):
         self.pretrained_model_score('bert-base-nli-mean-tokens', 86.53)
         self.pretrained_model_score('bert-base-nli-max-tokens', 87.00)
         self.pretrained_model_score('bert-base-nli-cls-token', 85.93)
         self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 89.26)
-
     def test_bert_large(self):
         self.pretrained_model_score('bert-large-nli-mean-tokens', 90.06)
         self.pretrained_model_score('bert-large-nli-max-tokens', 90.15)
         self.pretrained_model_score('bert-large-nli-cls-token', 89.51)
         self.pretrained_model_score('bert-large-nli-stsb-mean-tokens', 92.27)
 
     def test_roberta(self):
         self.pretrained_model_score('roberta-base-nli-mean-tokens', 87.91)
         self.pretrained_model_score('roberta-large-nli-mean-tokens', 89.41)
         self.pretrained_model_score('roberta-base-nli-stsb-mean-tokens', 93.39)
         self.pretrained_model_score('roberta-large-nli-stsb-mean-tokens', 91.26)
 
     def test_distilbert(self):
         self.pretrained_model_score('distilbert-base-nli-mean-tokens', 88.83)
         self.pretrained_model_score('distilbert-base-nli-stsb-mean-tokens', 91.01)
         self.pretrained_model_score('paraphrase-distilroberta-base-v1', 90.89)
 
     def test_multiling(self):
         self.pretrained_model_score('distiluse-base-multilingual-cased', 88.79)
         self.pretrained_model_score('paraphrase-xlm-r-multilingual-v1', 92.76)
         self.pretrained_model_score('paraphrase-multilingual-MiniLM-L12-v2', 92.64)
 
     def test_mpnet(self):
         self.pretrained_model_score('paraphrase-mpnet-base-v2', 92.83)
 
     def test_other_models(self):
         self.pretrained_model_score('average_word_embeddings_komninos', 68.97)
 
     def test_msmarco(self):
         self.pretrained_model_score('msmarco-roberta-base-ance-firstp', 83.61)
         self.pretrained_model_score('msmarco-distilbert-base-v3', 87.96)
 
     def test_sentence_t5(self):
         self.pretrained_model_score('sentence-t5-base', 92.75)
-
-if "__main__" == __name__:
-    unittest.main()
\ No newline at end of file
diff --git a/tests/test_train_stsb.py b/tests/test_train_stsb.py
index 952808ecc..b2d72206e 100644
--- a/tests/test_train_stsb.py
+++ b/tests/test_train_stsb.py
@@ -5,6 +5,7 @@
 import gzip
 import os
 import unittest
+import pytest
 
 from torch.utils.data import DataLoader
 
@@ -26,7 +27,7 @@ def setUp(self):
         #Read NLI
         label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
         self.nli_train_samples = []
-        max_train_samples = 100
+        max_train_samples = 10000
         with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
             for row in reader:
@@ -39,7 +40,6 @@ def setUp(self):
 
         #Read STSB
         self.stsb_train_samples = []
         self.test_samples = []
-        max_train_samples = 100
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
             for row in reader:
@@ -48,7 +48,7 @@ def setUp(self):
                 score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
 
                 if row['split'] == 'test':
                     self.test_samples.append(inp_example)
-                elif row['split'] == 'train' and len(self.stsb_train_samples) < max_train_samples:
+                elif row['split'] == 'train':
                     self.stsb_train_samples.append(inp_example)
 
     def evaluate_stsb_test(self, model, expected_score):
@@ -57,7 +57,8 @@ def evaluate_stsb_test(self, model, expected_score):
         print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(score, expected_score))
         assert score > expected_score or abs(score-expected_score) < 0.1
 
-    def test_train_stsb(self):
+    @pytest.mark.slow
+    def test_train_stsb_slow(self):
         word_embedding_model = models.Transformer('distilbert-base-uncased')
         pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
                   warmup_steps=int(len(train_dataloader)*0.1),
                   use_amp=True)
 
+        self.evaluate_stsb_test(model, 80.0)
+
+    def test_train_stsb(self):
+        word_embedding_model = models.Transformer('distilbert-base-uncased')
+        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        train_dataset = SentencesDataset(self.stsb_train_samples[:100], model)
+        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
+        train_loss = losses.CosineSimilarityLoss(model=model)
+        model.fit(train_objectives=[(train_dataloader, train_loss)],
+                  evaluator=None,
+                  epochs=1,
+                  evaluation_steps=1000,
+                  warmup_steps=int(len(train_dataloader)*0.1),
+                  use_amp=True)
+
         self.evaluate_stsb_test(model, 60.0)
 
-    def test_train_nli(self):
+    @pytest.mark.slow
+    def test_train_nli_slow(self):
         word_embedding_model = models.Transformer('distilbert-base-uncased')
         pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
 
         self.evaluate_stsb_test(model, 50.0)
 
+    def test_train_nli(self):
+        word_embedding_model = models.Transformer('distilbert-base-uncased')
+        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        train_dataset = SentencesDataset(self.nli_train_samples[:100], model=model)
+        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
+        train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=3)
+        model.fit(train_objectives=[(train_dataloader, train_loss)],
+                  evaluator=None,
+                  epochs=1,
+                  warmup_steps=int(len(train_dataloader) * 0.1),
+                  use_amp=True)
-
-if "__main__" == __name__:
-    unittest.main()
\ No newline at end of file
+        self.evaluate_stsb_test(model, 50.0)
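
With the pytest.ini introduced in PATCH 12, the slow variants are deselected by default (addopts = --strict-markers -m "not slow") and selected explicitly with pytest -m slow; --strict-markers additionally turns any unregistered marker into an error. A minimal sketch of the mechanics (test names and bodies are illustrative):

    import pytest

    @pytest.mark.slow          # must be registered in pytest.ini due to --strict-markers
    def test_train_full():     # hypothetical slow test: skipped by default
        assert sum(range(10000)) == 49995000

    def test_train_small():    # unmarked test: runs on a plain `pytest` invocation
        assert sum(range(100)) == 4950

    # pytest          -> runs test_train_small only (addopts deselect "slow")
    # pytest -m slow  -> runs test_train_full only
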