Merge pull request #2350 from tomaarsen/tests/simplify_and_ci
`[ci]` Simplify tests, add CI, patch `paraphrase_mining_embeddings`
tomaarsen authored Dec 12, 2023
2 parents b93aac1 + b6553e8 commit 5da2188
Showing 6 changed files with 178 additions and 54 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,52 @@
+name: Unit tests
+
+on:
+  push:
+    branches:
+      - master
+      - v*-release
+  pull_request:
+    branches:
+      - master
+  workflow_dispatch:
+
+jobs:
+
+  test_sampling:
+    name: Run unit tests
+    strategy:
+      matrix:
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
+        os: [ubuntu-latest, windows-latest]
+      fail-fast: false
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Setup Python environment
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Try to load cached dependencies
+        uses: actions/cache@v3
+        id: restore-cache
+        with:
+          path: ${{ env.pythonLocation }}
+          key: python-dependencies-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ env.pythonLocation }}
+
+      - name: Install external dependencies on cache miss
+        run: |
+          python -m pip install --no-cache-dir --upgrade pip
+          python -m pip install --no-cache-dir -r requirements.txt
+          python -m pip install --no-cache-dir pytest
+        if: steps.restore-cache.outputs.cache-hit != 'true'
+
+      - name: Install the checked-out sentence-transformers
+        run: python -m pip install .
+
+      - name: Run unit tests
+        shell: bash
+        run: |
+          pytest -sv tests/
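A note on the caching step above: the cache key concatenates the OS, the Python version, and content hashes of `setup.py` and `requirements.txt`, so editing either file invalidates the cached environment and forces a reinstall. A minimal Python sketch of that content-addressed key idea (a hypothetical helper, not part of this repo or the workflow):

import hashlib
from pathlib import Path

def cache_key(os_name: str, python_version: str, *dep_files: str) -> str:
    # Digest each dependency file; any edit changes the digest and busts the cache.
    digests = [hashlib.sha256(Path(f).read_bytes()).hexdigest()[:12] for f in dep_files]
    return "-".join(["python-dependencies", os_name, python_version, *digests])

# e.g. cache_key("ubuntu-latest", "3.11", "setup.py", "requirements.txt")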
6 changes: 6 additions & 0 deletions pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+testpaths =
+    tests
+addopts = --strict-markers -m "not slow"
+markers =
+    slow: marks tests as slow
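With `--strict-markers`, any marker not declared in this file fails collection, and the default `-m "not slow"` deselects every test marked slow unless the marker expression is overridden (`pytest -m slow` for only the slow tests, `pytest -m ""` for everything). A minimal sketch of the resulting split, with hypothetical test names:

import pytest

def test_fast_path():
    # Collected and run by a plain `pytest` invocation.
    assert 1 + 1 == 2

@pytest.mark.slow
def test_full_training_run():
    # Deselected by the default addopts; opt in with `pytest -m slow`.
    assert sum(range(10)) == 45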
2 changes: 1 addition & 1 deletion sentence_transformers/util.py
@@ -194,7 +194,7 @@ def paraphrase_mining_embeddings(embeddings: Tensor,
 
                     if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
                         added_pairs.add((sorted_i, sorted_j))
-                        pairs_list.append([score, i, j])
+                        pairs_list.append([score, sorted_i, sorted_j])
 
     # Highest scores first
     pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
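The one-line patch above is the `paraphrase_mining_embeddings` fix from the commit title: `added_pairs` deduplicates on the sorted index pair, but the output list previously stored the raw `i, j`, so the returned indices could disagree with the deduplication key. A standalone sketch of the intended invariant (hypothetical data, not the library code):

added_pairs = set()
pairs_list = []

candidates = [(0.9, 3, 1), (0.9, 1, 3), (0.7, 2, 5)]  # (score, i, j)
for score, i, j in candidates:
    sorted_i, sorted_j = min(i, j), max(i, j)  # canonical order
    if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
        added_pairs.add((sorted_i, sorted_j))
        pairs_list.append([score, sorted_i, sorted_j])  # store the canonical pair, per the fix

pairs_list.sort(key=lambda x: x[0], reverse=True)  # highest scores first
print(pairs_list)  # [[0.9, 1, 3], [0.7, 2, 5]]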
30 changes: 15 additions & 15 deletions tests/test_cross_encoder.py
@@ -5,10 +5,10 @@
 import gzip
 import os
 import unittest
+import pytest
 
 from torch.utils.data import DataLoader
-import logging
-from sentence_transformers import CrossEncoder, util, LoggingHandler
+from sentence_transformers import CrossEncoder, util
 from sentence_transformers.readers import InputExample
 from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
 
@@ -22,23 +22,20 @@ def setUp(self):
 
         #Read STSB
         self.stsb_train_samples = []
-        self.dev_samples = []
         self.test_samples = []
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
             for row in reader:
                 score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
 
-                if row['split'] == 'dev':
-                    self.dev_samples.append(inp_example)
-                elif row['split'] == 'test':
+                if row['split'] == 'test':
                     self.test_samples.append(inp_example)
-                else:
+                elif row['split'] == 'train':
                     self.stsb_train_samples.append(inp_example)
 
-    def evaluate_stsb_test(self, model, expected_score):
-        evaluator = CECorrelationEvaluator.from_input_examples(self.test_samples, name='sts-test')
+    def evaluate_stsb_test(self, model, expected_score, num_test_samples: int = -1):
+        evaluator = CECorrelationEvaluator.from_input_examples(self.test_samples[:num_test_samples], name='sts-test')
         score = evaluator(model)*100
         print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(score, expected_score))
         assert score > expected_score or abs(score-expected_score) < 0.1
@@ -47,16 +44,19 @@ def test_pretrained_stsb(self):
         model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
         self.evaluate_stsb_test(model, 87.92)
 
-    def test_train_stsb(self):
+    @pytest.mark.slow
+    def test_train_stsb_slow(self):
         model = CrossEncoder('distilroberta-base', num_labels=1)
         train_dataloader = DataLoader(self.stsb_train_samples, shuffle=True, batch_size=16)
         model.fit(train_dataloader=train_dataloader,
                   epochs=1,
                   warmup_steps=int(len(train_dataloader)*0.1))
         self.evaluate_stsb_test(model, 75)
 
-
-
-
-if "__main__" == __name__:
-    unittest.main()
+    def test_train_stsb(self):
+        model = CrossEncoder('distilroberta-base', num_labels=1)
+        train_dataloader = DataLoader(self.stsb_train_samples[:500], shuffle=True, batch_size=16)
+        model.fit(train_dataloader=train_dataloader,
+                  epochs=1,
+                  warmup_steps=int(len(train_dataloader)*0.1))
+        self.evaluate_stsb_test(model, 50, num_test_samples=100)
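The fast `test_train_stsb` above trains on a 500-sample slice and scores only 100 test pairs against a relaxed floor of 50, while the slow variant keeps the full data and the 75 floor; both share the same acceptance rule, which passes when the score beats the expected value or lands within 0.1 points of it. A standalone sketch of that check, with hypothetical numbers:

def passes(score: float, expected_score: float) -> bool:
    # Treat expected_score as a floor, with a small tolerance for run-to-run noise.
    return score > expected_score or abs(score - expected_score) < 0.1

assert passes(87.95, 87.92)      # above the floor
assert passes(87.85, 87.92)      # within the 0.1 tolerance
assert not passes(80.00, 87.92)  # a real regression fails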
96 changes: 68 additions & 28 deletions tests/test_pretrained_stsb.py
@@ -1,51 +1,49 @@
"""
Tests that the pretrained models produce the correct scores on the STSbenchmark dataset
"""
from functools import partial
from sentence_transformers import SentenceTransformer, InputExample, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import unittest
import os
import gzip
import csv
import pytest

class PretrainedSTSbTest(unittest.TestCase):
def pretrained_model_score(model_name, expected_score, max_test_samples: int = 100):
model = SentenceTransformer(model_name)
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

def pretrained_model_score(self, model_name, expected_score):
model = SentenceTransformer(model_name)
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
if not os.path.exists(sts_dataset_path):
util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

if not os.path.exists(sts_dataset_path):
util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
if row['split'] == 'test':
test_samples.append(inp_example)
if max_test_samples != -1 and len(test_samples) >= max_test_samples:
break

if row['split'] == 'dev':
dev_samples.append(inp_example)
elif row['split'] == 'test':
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
score = model.evaluate(evaluator)*100
print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
assert score > expected_score or abs(score-expected_score) < 0.1

score = model.evaluate(evaluator)*100
print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
assert score > expected_score or abs(score-expected_score) < 0.1
@pytest.mark.slow
class TestPretrainedSTSbSlow:
pretrained_model_score = partial(pretrained_model_score, max_test_samples=-1)

def test_bert_base(self):
self.pretrained_model_score('bert-base-nli-mean-tokens', 77.12)
self.pretrained_model_score('bert-base-nli-max-tokens', 77.21)
self.pretrained_model_score('bert-base-nli-cls-token', 76.30)
self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 85.14)


def test_bert_large(self):
self.pretrained_model_score('bert-large-nli-mean-tokens', 79.19)
self.pretrained_model_score('bert-large-nli-max-tokens', 78.41)
@@ -81,5 +79,47 @@ def test_msmarco(self):
     def test_sentence_t5(self):
         self.pretrained_model_score('sentence-t5-base', 85.52)
 
-if "__main__" == __name__:
-    unittest.main()
+
+class TestPretrainedSTSbFast:
+    pretrained_model_score = partial(pretrained_model_score, max_test_samples=100)
+
+    def test_bert_base(self):
+        self.pretrained_model_score('bert-base-nli-mean-tokens', 86.53)
+        self.pretrained_model_score('bert-base-nli-max-tokens', 87.00)
+        self.pretrained_model_score('bert-base-nli-cls-token', 85.93)
+        self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 89.26)
+
+    def test_bert_large(self):
+        self.pretrained_model_score('bert-large-nli-mean-tokens', 90.06)
+        self.pretrained_model_score('bert-large-nli-max-tokens', 90.15)
+        self.pretrained_model_score('bert-large-nli-cls-token', 89.51)
+        self.pretrained_model_score('bert-large-nli-stsb-mean-tokens', 92.27)
+
+    def test_roberta(self):
+        self.pretrained_model_score('roberta-base-nli-mean-tokens', 87.91)
+        self.pretrained_model_score('roberta-large-nli-mean-tokens', 89.41)
+        self.pretrained_model_score('roberta-base-nli-stsb-mean-tokens', 93.39)
+        self.pretrained_model_score('roberta-large-nli-stsb-mean-tokens', 91.26)
+
+    def test_distilbert(self):
+        self.pretrained_model_score('distilbert-base-nli-mean-tokens', 88.83)
+        self.pretrained_model_score('distilbert-base-nli-stsb-mean-tokens', 91.01)
+        self.pretrained_model_score('paraphrase-distilroberta-base-v1', 90.89)
+
+    def test_multiling(self):
+        self.pretrained_model_score('distiluse-base-multilingual-cased', 88.79)
+        self.pretrained_model_score('paraphrase-xlm-r-multilingual-v1', 92.76)
+        self.pretrained_model_score('paraphrase-multilingual-MiniLM-L12-v2', 92.64)
+
+    def test_mpnet(self):
+        self.pretrained_model_score('paraphrase-mpnet-base-v2', 92.83)
+
+    def test_other_models(self):
+        self.pretrained_model_score('average_word_embeddings_komninos', 68.97)
+
+    def test_msmarco(self):
+        self.pretrained_model_score('msmarco-roberta-base-ance-firstp', 83.61)
+        self.pretrained_model_score('msmarco-distilbert-base-v3', 87.96)
+
+    def test_sentence_t5(self):
+        self.pretrained_model_score('sentence-t5-base', 92.75)
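Binding the module-level `pretrained_model_score` onto each test class with `functools.partial` works because `partial` objects are plain callables rather than descriptors: accessing `self.pretrained_model_score` does not prepend `self`, so the free function's `(model_name, expected_score)` signature is used unchanged while each class pins its own `max_test_samples`. A minimal sketch of the pattern (hypothetical names):

from functools import partial

def check(value, threshold=0):
    # A free function with no `self` parameter.
    return value > threshold

class StrictChecks:
    # partial is not a descriptor, so instance access does not inject `self`.
    check = partial(check, threshold=10)

assert StrictChecks().check(11) is True
assert StrictChecks().check(5) is False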
46 changes: 36 additions & 10 deletions tests/test_train_stsb.py
@@ -5,6 +5,7 @@
 import gzip
 import os
 import unittest
+import pytest
 
 from torch.utils.data import DataLoader
 
@@ -38,19 +39,16 @@ def setUp(self):
 
         #Read STSB
         self.stsb_train_samples = []
-        self.dev_samples = []
         self.test_samples = []
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
             for row in reader:
                 score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
 
-                if row['split'] == 'dev':
-                    self.dev_samples.append(inp_example)
-                elif row['split'] == 'test':
+                if row['split'] == 'test':
                     self.test_samples.append(inp_example)
-                else:
+                elif row['split'] == 'train':
                     self.stsb_train_samples.append(inp_example)
 
     def evaluate_stsb_test(self, model, expected_score):
@@ -59,7 +57,8 @@ def evaluate_stsb_test(self, model, expected_score):
         print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(score, expected_score))
         assert score > expected_score or abs(score-expected_score) < 0.1
 
-    def test_train_stsb(self):
+    @pytest.mark.slow
+    def test_train_stsb_slow(self):
         word_embedding_model = models.Transformer('distilbert-base-uncased')
         pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
@@ -75,7 +74,24 @@
 
         self.evaluate_stsb_test(model, 80.0)
 
-    def test_train_nli(self):
+    def test_train_stsb(self):
+        word_embedding_model = models.Transformer('distilbert-base-uncased')
+        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        train_dataset = SentencesDataset(self.stsb_train_samples[:100], model)
+        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
+        train_loss = losses.CosineSimilarityLoss(model=model)
+        model.fit(train_objectives=[(train_dataloader, train_loss)],
+                  evaluator=None,
+                  epochs=1,
+                  evaluation_steps=1000,
+                  warmup_steps=int(len(train_dataloader)*0.1),
+                  use_amp=True)
+
+        self.evaluate_stsb_test(model, 60.0)
+
+    @pytest.mark.slow
+    def test_train_nli_slow(self):
         word_embedding_model = models.Transformer('distilbert-base-uncased')
         pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
@@ -90,7 +106,17 @@ def test_train_nli(self):
 
         self.evaluate_stsb_test(model, 50.0)
 
-
-
-if "__main__" == __name__:
-    unittest.main()
+    def test_train_nli(self):
+        word_embedding_model = models.Transformer('distilbert-base-uncased')
+        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        train_dataset = SentencesDataset(self.nli_train_samples[:100], model=model)
+        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
+        train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=3)
+        model.fit(train_objectives=[(train_dataloader, train_loss)],
+                  evaluator=None,
+                  epochs=1,
+                  warmup_steps=int(len(train_dataloader) * 0.1),
+                  use_amp=True)
+
+        self.evaluate_stsb_test(model, 50.0)