From 1c31322432a7a41c88255a15773739233befc2af Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Wed, 6 Nov 2024 09:19:22 +0100
Subject: [PATCH 1/3] [`fix`] Avoid passing eval_dataset=None to transformers
 due to >=v4.46.0 crash (#3035)

---
 sentence_transformers/trainer.py |  8 +++++-
 tests/test_trainer.py            | 46 ++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/sentence_transformers/trainer.py b/sentence_transformers/trainer.py
index 3fd20eb1f..2a7907e3a 100644
--- a/sentence_transformers/trainer.py
+++ b/sentence_transformers/trainer.py
@@ -209,7 +209,7 @@ def __init__(
             "args": args,
             "data_collator": data_collator,
             "train_dataset": train_dataset,
-            "eval_dataset": eval_dataset,
+            "eval_dataset": eval_dataset if eval_dataset is not None or evaluator is None else "dummy",
             "model_init": model_init,
             "compute_metrics": compute_metrics,
             "callbacks": callbacks,
@@ -222,6 +222,12 @@ def __init__(
         else:
             super_kwargs["tokenizer"] = tokenizer
         super().__init__(**super_kwargs)
+        # Transformers v4.46.0 introduced a ValueError if `eval_dataset` is None while eval_strategy is not "no",
+        # but in Sentence Transformers you can also evaluate without an eval_dataset via an evaluator, so we set
+        # it to "dummy" in that case to avoid the ValueError
+        if self.eval_dataset == "dummy":
+            self.eval_dataset = None
+
         # Every Sentence Transformer model can always return a loss, so we set this to True
         # to avoid having to specify it in the data collator or model's forward
         self.can_return_loss = True
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 64fbf827e..e7e9827d5 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -8,8 +8,10 @@
 import pytest
 import torch
+from datasets.dataset_dict import DatasetDict
 
 from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
 from sentence_transformers.training_args import SentenceTransformerTrainingArguments
 from sentence_transformers.util import is_datasets_available, is_training_available
 from tests.utils import SafeTemporaryDirectory
@@ -230,3 +232,47 @@ def test_trainer(
     original_embeddings = original_model.encode("The cat is on the mat.", convert_to_tensor=True)
     new_embeddings = model.encode("The cat is on the the mat.", convert_to_tensor=True)
     assert not torch.equal(original_embeddings, new_embeddings)
+
+
+@pytest.mark.parametrize("use_eval_dataset", [True, False])
+@pytest.mark.parametrize("use_evaluator", [True, False])
+def test_trainer_no_eval_dataset_with_eval_strategy(
+    stsb_bert_tiny_model: SentenceTransformer,
+    stsb_dataset_dict: DatasetDict,
+    use_eval_dataset: bool,
+    use_evaluator: bool,
+    tmp_path: Path,
+) -> None:
+    # Expect a crash when `args.eval_strategy` is not "no" but neither `eval_dataset` nor `evaluator` is provided
+    # Otherwise, the trainer should be created without any issues
+    model = stsb_bert_tiny_model
+    train_dataset = stsb_dataset_dict["train"].select(range(10))
+    eval_dataset = stsb_dataset_dict["validation"].select(range(10))
+    evaluator = EmbeddingSimilarityEvaluator(
+        sentences1=eval_dataset["sentence1"],
+        sentences2=eval_dataset["sentence2"],
+        scores=[score / 5 for score in eval_dataset["score"]],
+        name="stsb-validation",
+    )
+    loss = losses.CosineSimilarityLoss(model=model)
+    args = SentenceTransformerTrainingArguments(output_dir=tmp_path, eval_strategy="steps")
+
+    kwargs = {}
+    if use_eval_dataset:
kwargs["eval_dataset"] = eval_dataset + if use_evaluator: + kwargs["evaluator"] = evaluator + + if not use_eval_dataset and not use_evaluator: + context = pytest.raises(ValueError, match=".*`args.eval_strategy`.*") + else: + context = nullcontext() + + with context: + SentenceTransformerTrainer( + model=model, + args=args, + train_dataset=train_dataset, + loss=loss, + **kwargs, + ) From 4b7a2d6ce9ed3068b2528b47a77709a3cd15e166 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Wed, 6 Nov 2024 09:19:31 +0100 Subject: [PATCH 2/3] [`docs`] Update the dated example in the NanoBEIREvaluator (#3034) --- .../evaluation/NanoBEIREvaluator.py | 101 +++++++++++------- 1 file changed, 62 insertions(+), 39 deletions(-) diff --git a/sentence_transformers/evaluation/NanoBEIREvaluator.py b/sentence_transformers/evaluation/NanoBEIREvaluator.py index 19cb7ce60..1e21f2254 100644 --- a/sentence_transformers/evaluation/NanoBEIREvaluator.py +++ b/sentence_transformers/evaluation/NanoBEIREvaluator.py @@ -84,8 +84,7 @@ class NanoBEIREvaluator(SentenceEvaluator): from sentence_transformers import SentenceTransformer from sentence_transformers.evaluation import NanoBEIREvaluator - # Load a model - model = SentenceTransformer('all-mpnet-base-v2') + model = SentenceTransformer('intfloat/multilingual-e5-large-instruct') datasets = ["QuoraRetrieval", "MSMARCO"] query_prompts = { @@ -95,54 +94,78 @@ class NanoBEIREvaluator(SentenceEvaluator): evaluator = NanoBEIREvaluator( dataset_names=datasets, - name="NanoBEIR", query_prompts=query_prompts, ) results = evaluator(model) ''' NanoBEIR Evaluation of the model on ['QuoraRetrieval', 'MSMARCO'] dataset: - Evaluating NanoBeIRNanoQuoraRetrieval - Evaluating NanoBeIRNanoMSMARCO - + Evaluating NanoQuoraRetrieval + Information Retrieval Evaluation of the model on the NanoQuoraRetrieval dataset: + Queries: 50 + Corpus: 5046 + + Score-Function: cosine + Accuracy@1: 92.00% + Accuracy@3: 98.00% + Accuracy@5: 100.00% + Accuracy@10: 100.00% + Precision@1: 92.00% + Precision@3: 40.67% + Precision@5: 26.00% + Precision@10: 14.00% + Recall@1: 81.73% + Recall@3: 94.20% + Recall@5: 97.93% + Recall@10: 100.00% + MRR@10: 0.9540 + NDCG@10: 0.9597 + MAP@100: 0.9395 + + Evaluating NanoMSMARCO + Information Retrieval Evaluation of the model on the NanoMSMARCO dataset: + Queries: 50 + Corpus: 5043 + + Score-Function: cosine + Accuracy@1: 40.00% + Accuracy@3: 74.00% + Accuracy@5: 78.00% + Accuracy@10: 88.00% + Precision@1: 40.00% + Precision@3: 24.67% + Precision@5: 15.60% + Precision@10: 8.80% + Recall@1: 40.00% + Recall@3: 74.00% + Recall@5: 78.00% + Recall@10: 88.00% + MRR@10: 0.5849 + NDCG@10: 0.6572 + MAP@100: 0.5892 Average Queries: 50.0 Average Corpus: 5044.5 Aggregated for Score Function: cosine - Accuracy@1: 39.00% - Accuracy@3: 57.00% - Accuracy@5: 66.00% - Accuracy@10: 77.00% - Precision@1: 39.00% - Recall@1: 34.03% - Precision@3: 20.67% - Recall@3: 54.07% - Precision@5: 15.00% - Recall@5: 64.27% - Precision@10: 8.90% - Recall@10: 75.97% - MRR@10: 0.5004 - NDCG@10: 0.5513 - Aggregated for Score Function: dot - Accuracy@1: 39.00% - Accuracy@3: 57.00% - Accuracy@5: 66.00% - Accuracy@10: 77.00% - Precision@1: 39.00% - Recall@1: 34.03% - Precision@3: 20.67% - Recall@3: 54.07% - Precision@5: 15.00% - Recall@5: 64.27% - Precision@10: 8.90% - Recall@10: 75.97% - MRR@10: 0.5004 - NDCG@10: 0.5513 + Accuracy@1: 66.00% + Accuracy@3: 86.00% + Accuracy@5: 89.00% + Accuracy@10: 94.00% + Precision@1: 66.00% + Recall@1: 60.87% + Precision@3: 32.67% + Recall@3: 
84.10% + Precision@5: 20.80% + Recall@5: 87.97% + Precision@10: 11.40% + Recall@10: 94.00% + MRR@10: 0.7694 + NDCG@10: 0.8085 ''' - logger.info(evaluator.primary_metric) - # => "cosine_ndcg@10" - logger.info(results["mean"][evaluator.primary_metric]) - # => 0.5512516989358924 + print(evaluator.primary_metric) + # => "NanoBEIR_mean_cosine_ndcg@10" + print(results[evaluator.primary_metric]) + # => 0.8084508771660436 """ def __init__( From 1cb196ad3a4dd3575eaba956af8b29c89f8a7c0d Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Wed, 6 Nov 2024 09:20:14 +0100 Subject: [PATCH 3/3] [`deprecate`] Drop Python 3.8 support due to EOL (#3033) * Drop Python 3.8 support due to EOL * Apply ruff improvements due to Python 3.8 no longer being supported --- .github/workflows/tests.yml | 2 +- README.md | 2 +- docs/installation.md | 2 +- .../training/matryoshka/matryoshka_eval_stsb.py | 16 ++++++++-------- .../quora_duplicate_questions/create_splits.py | 8 +++++--- .../CT/train_ct_from_file.py | 6 +++--- .../train_ct-improved_from_file.py | 6 +++--- examples/unsupervised_learning/MLM/train_mlm.py | 16 ++++++++++------ .../SimCSE/train_simcse_from_file.py | 6 +++--- .../TSDAE/train_tsdae_from_file.py | 6 +++--- index.rst | 2 +- pyproject.toml | 3 +-- sentence_transformers/SentenceTransformer.py | 3 ++- .../datasets/ParallelSentencesDataset.py | 8 +++++--- .../evaluation/InformationRetrievalEvaluator.py | 6 ++++-- sentence_transformers/evaluation/MSEEvaluator.py | 6 ++++-- .../evaluation/MSEEvaluatorFromDataFrame.py | 6 ++++-- .../evaluation/SequentialEvaluator.py | 3 ++- sentence_transformers/fit_mixin.py | 3 ++- .../losses/AdaptiveLayerLoss.py | 3 ++- .../losses/BatchAllTripletLoss.py | 2 +- .../losses/BatchHardSoftMarginTripletLoss.py | 2 +- .../losses/BatchHardTripletLoss.py | 2 +- .../losses/BatchSemiHardTripletLoss.py | 2 +- .../losses/CachedGISTEmbedLoss.py | 3 ++- .../losses/CachedMultipleNegativesRankingLoss.py | 3 ++- ...achedMultipleNegativesSymmetricRankingLoss.py | 3 ++- sentence_transformers/losses/CoSENTLoss.py | 3 ++- sentence_transformers/losses/ContrastiveLoss.py | 3 ++- .../losses/ContrastiveTensionLoss.py | 2 +- .../losses/CosineSimilarityLoss.py | 3 ++- .../losses/DenoisingAutoEncoderLoss.py | 2 +- sentence_transformers/losses/GISTEmbedLoss.py | 3 ++- sentence_transformers/losses/MSELoss.py | 2 +- sentence_transformers/losses/MarginMSELoss.py | 2 +- sentence_transformers/losses/MatryoshkaLoss.py | 3 ++- .../losses/MegaBatchMarginLoss.py | 2 +- .../losses/MultipleNegativesRankingLoss.py | 3 ++- .../MultipleNegativesSymmetricRankingLoss.py | 3 ++- .../losses/OnlineContrastiveLoss.py | 2 +- sentence_transformers/losses/SoftmaxLoss.py | 3 ++- sentence_transformers/losses/TripletLoss.py | 3 ++- sentence_transformers/model_card_template.md | 2 +- sentence_transformers/models/Asym.py | 3 +-- sentence_transformers/models/WordEmbeddings.py | 8 +++++--- .../models/tokenizer/PhraseTokenizer.py | 2 +- .../models/tokenizer/WhitespaceTokenizer.py | 2 +- .../models/tokenizer/WordTokenizer.py | 2 +- sentence_transformers/readers/STSDataReader.py | 8 +++++--- sentence_transformers/sampler.py | 3 ++- tests/test_cross_encoder.py | 2 +- tests/test_sentence_transformer.py | 6 +++--- tests/test_train_stsb.py | 2 +- tests/utils.py | 2 +- 54 files changed, 122 insertions(+), 89 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 12c619375..6b5587825 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,7 
+17,7 @@ jobs: name: Run unit tests strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] os: [ubuntu-latest, windows-latest] fail-fast: false runs-on: ${{ matrix.os }} diff --git a/README.md b/README.md index b0c677b03..caf7efbf4 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ For the **full documentation**, see **[www.SBERT.net](https://www.sbert.net)**. ## Installation -We recommend **Python 3.8+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.34.0+](https://github.com/huggingface/transformers)**. +We recommend **Python 3.9+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.34.0+](https://github.com/huggingface/transformers)**. **Install with pip** diff --git a/docs/installation.md b/docs/installation.md index 1e014165d..334b8cbd8 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,6 +1,6 @@ # Installation -We recommend **Python 3.8+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.41.0+](https://github.com/huggingface/transformers)**. There are 5 extra options to install Sentence Transformers: +We recommend **Python 3.9+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.41.0+](https://github.com/huggingface/transformers)**. There are 5 extra options to install Sentence Transformers: * **Default:** This allows for loading, saving, and inference (i.e., getting embeddings) of models. * **ONNX:** This allows for loading, saving, inference, optimizing, and quantizing of models using the ONNX backend. * **OpenVINO:** This allows for loading, saving, and inference of models using the OpenVINO backend. 
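Most of the remaining hunks in this patch are mechanical: they swap the deprecated `typing` aliases (`Dict`, `List`, `Tuple`, `Iterable`, `Iterator`, `Generator`) for PEP 585 builtin generics and `collections.abc` imports, which only becomes safe once Python 3.8 support is dropped, since subscriptable builtin generics arrived in Python 3.9. A minimal before/after sketch, using an illustrative function that does not appear in this codebase:

    from __future__ import annotations

    from collections.abc import Iterable

    # Before, for Python 3.8 compatibility:
    #   from typing import Dict, Iterable, List
    #   def token_counts(sentences: List[str]) -> Dict[str, int]: ...

    def token_counts(sentences: Iterable[str]) -> dict[str, int]:
        # dict[str, int] is valid at runtime from Python 3.9 onward (PEP 585)
        return {sentence: len(sentence.split()) for sentence in sentences}

    print(token_counts(["the cat sat", "hello"]))  # {'the cat sat': 3, 'hello': 1}
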
diff --git a/examples/training/matryoshka/matryoshka_eval_stsb.py b/examples/training/matryoshka/matryoshka_eval_stsb.py index 7f30f13f5..7e874eabd 100644 --- a/examples/training/matryoshka/matryoshka_eval_stsb.py +++ b/examples/training/matryoshka/matryoshka_eval_stsb.py @@ -5,7 +5,7 @@ import argparse import os -from typing import Dict, List, Optional, Tuple, cast +from typing import Optional, cast import matplotlib.pyplot as plt import numpy as np @@ -21,7 +21,7 @@ # Dimension plot def _grouped_barplot_ratios( - group_name_to_x_to_y: Dict[str, Dict[int, float]], ax: Optional[plt.Axes] = None + group_name_to_x_to_y: dict[str, dict[int, float]], ax: Optional[plt.Axes] = None ) -> plt.Axes: # To save a pandas dependency, do from scratch in matplotlib if ax is None: @@ -72,9 +72,9 @@ def _grouped_barplot_ratios( def plot_across_dimensions( - model_name_to_dim_to_score: Dict[str, Dict[int, float]], + model_name_to_dim_to_score: dict[str, dict[int, float]], filename: str, - figsize: Tuple[float, float] = (7, 7), + figsize: tuple[float, float] = (7, 7), title: str = "STSB test score for various embedding dimensions (via truncation),\nwith and without Matryoshka loss", ) -> None: # Sort each by key @@ -139,8 +139,8 @@ def plot_across_dimensions( args = parser.parse_args() plot_filename: str = args.plot_filename - model_names: List[str] = args.model_names - DIMENSIONS: List[int] = args.dimensions + model_names: list[str] = args.model_names + DIMENSIONS: list[int] = args.dimensions # Load STSb stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test") @@ -153,10 +153,10 @@ def plot_across_dimensions( ) # Run test_evaluator - model_name_to_dim_to_score: Dict[str, Dict[int, float]] = {} + model_name_to_dim_to_score: dict[str, dict[int, float]] = {} for model_name in tqdm(model_names, desc="Evaluating models"): model = SentenceTransformer(model_name) - dim_to_score: Dict[int, float] = {} + dim_to_score: dict[int, float] = {} for dim in tqdm(DIMENSIONS, desc=f"Evaluating {model_name}"): output_path = os.path.join(model_name, f"dim-{dim}") os.makedirs(output_path) diff --git a/examples/training/quora_duplicate_questions/create_splits.py b/examples/training/quora_duplicate_questions/create_splits.py index 777c42ba7..1c6ae3331 100644 --- a/examples/training/quora_duplicate_questions/create_splits.py +++ b/examples/training/quora_duplicate_questions/create_splits.py @@ -481,9 +481,11 @@ def write_mining_files(name, ids, dups): ###### Classification dataset ##### -with open("quora-IR-dataset/classification/train_pairs.tsv", "w", encoding="utf8") as fOutTrain, open( - "quora-IR-dataset/classification/dev_pairs.tsv", "w", encoding="utf8" -) as fOutDev, open("quora-IR-dataset/classification/test_pairs.tsv", "w", encoding="utf8") as fOutTest: +with ( + open("quora-IR-dataset/classification/train_pairs.tsv", "w", encoding="utf8") as fOutTrain, + open("quora-IR-dataset/classification/dev_pairs.tsv", "w", encoding="utf8") as fOutDev, + open("quora-IR-dataset/classification/test_pairs.tsv", "w", encoding="utf8") as fOutTest, +): fOutTrain.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n") fOutDev.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n") fOutTest.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n") diff --git a/examples/unsupervised_learning/CT/train_ct_from_file.py b/examples/unsupervised_learning/CT/train_ct_from_file.py index 15eb57a79..0dc6e01f9 100644 --- 
a/examples/unsupervised_learning/CT/train_ct_from_file.py +++ b/examples/unsupervised_learning/CT/train_ct_from_file.py @@ -55,9 +55,9 @@ ################# Read the train corpus ################# train_sentences = [] -with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open( - filepath, encoding="utf8" -) as fIn: +with ( + gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(filepath, encoding="utf8") as fIn +): for line in tqdm.tqdm(fIn, desc="Read file"): line = line.strip() if len(line) >= 10: diff --git a/examples/unsupervised_learning/CT_In-Batch_Negatives/train_ct-improved_from_file.py b/examples/unsupervised_learning/CT_In-Batch_Negatives/train_ct-improved_from_file.py index 76975b231..3b7115245 100644 --- a/examples/unsupervised_learning/CT_In-Batch_Negatives/train_ct-improved_from_file.py +++ b/examples/unsupervised_learning/CT_In-Batch_Negatives/train_ct-improved_from_file.py @@ -55,9 +55,9 @@ ################# Read the train corpus ################# train_sentences = [] -with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open( - filepath, encoding="utf8" -) as fIn: +with ( + gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(filepath, encoding="utf8") as fIn +): for line in tqdm.tqdm(fIn, desc="Read file"): line = line.strip() if len(line) >= 10: diff --git a/examples/unsupervised_learning/MLM/train_mlm.py b/examples/unsupervised_learning/MLM/train_mlm.py index f51a9f265..1fada3db9 100644 --- a/examples/unsupervised_learning/MLM/train_mlm.py +++ b/examples/unsupervised_learning/MLM/train_mlm.py @@ -48,9 +48,11 @@ train_sentences = [] train_path = sys.argv[2] -with gzip.open(train_path, "rt", encoding="utf8") if train_path.endswith(".gz") else open( - train_path, encoding="utf8" -) as fIn: +with ( + gzip.open(train_path, "rt", encoding="utf8") + if train_path.endswith(".gz") + else open(train_path, encoding="utf8") as fIn +): for line in fIn: line = line.strip() if len(line) >= 10: @@ -61,9 +63,11 @@ dev_sentences = [] if len(sys.argv) >= 4: dev_path = sys.argv[3] - with gzip.open(dev_path, "rt", encoding="utf8") if dev_path.endswith(".gz") else open( - dev_path, encoding="utf8" - ) as fIn: + with ( + gzip.open(dev_path, "rt", encoding="utf8") + if dev_path.endswith(".gz") + else open(dev_path, encoding="utf8") as fIn + ): for line in fIn: line = line.strip() if len(line) >= 10: diff --git a/examples/unsupervised_learning/SimCSE/train_simcse_from_file.py b/examples/unsupervised_learning/SimCSE/train_simcse_from_file.py index 5df7b41de..4191a9eb5 100644 --- a/examples/unsupervised_learning/SimCSE/train_simcse_from_file.py +++ b/examples/unsupervised_learning/SimCSE/train_simcse_from_file.py @@ -55,9 +55,9 @@ ################# Read the train corpus ################# train_samples = [] -with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open( - filepath, encoding="utf8" -) as fIn: +with ( + gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(filepath, encoding="utf8") as fIn +): for line in tqdm.tqdm(fIn, desc="Read file"): line = line.strip() if len(line) >= 10: diff --git a/examples/unsupervised_learning/TSDAE/train_tsdae_from_file.py b/examples/unsupervised_learning/TSDAE/train_tsdae_from_file.py index 2ff07664b..13cc5eaef 100644 --- a/examples/unsupervised_learning/TSDAE/train_tsdae_from_file.py +++ b/examples/unsupervised_learning/TSDAE/train_tsdae_from_file.py @@ -45,9 +45,9 @@ ################# Read the 
train corpus ################# train_sentences = [] -with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open( - filepath, encoding="utf8" -) as fIn: +with ( + gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(filepath, encoding="utf8") as fIn +): for line in tqdm.tqdm(fIn, desc="Read file"): line = line.strip() if len(line) >= 10: diff --git a/index.rst b/index.rst index 3c5b1cc1f..b1f3b3843 100644 --- a/index.rst +++ b/index.rst @@ -28,7 +28,7 @@ Using Sentence Transformer models is elementary: pip install -U sentence-transformers - We recommend **Python 3.8+** and **PyTorch 1.11.0+**. See `installation `_ for further installation options. + We recommend **Python 3.9+** and **PyTorch 1.11.0+**. See `installation `_ for further installation options. .. code-block:: python diff --git a/pyproject.toml b/pyproject.toml index 11550a5e7..a5073ab62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ maintainers = [ { name = "Tom Aarsen", email = "tom.aarsen@huggingface.co" } ] -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = [ "Transformer Networks", "BERT", @@ -25,7 +25,6 @@ classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py index fae45a366..966720a25 100644 --- a/sentence_transformers/SentenceTransformer.py +++ b/sentence_transformers/SentenceTransformer.py @@ -13,10 +13,11 @@ import traceback import warnings from collections import OrderedDict +from collections.abc import Iterable, Iterator from contextlib import contextmanager from multiprocessing import Queue from pathlib import Path -from typing import Any, Callable, Iterable, Iterator, Literal, overload +from typing import Any, Callable, Literal, overload import numpy as np import torch diff --git a/sentence_transformers/datasets/ParallelSentencesDataset.py b/sentence_transformers/datasets/ParallelSentencesDataset.py index 397356f8e..d43473328 100644 --- a/sentence_transformers/datasets/ParallelSentencesDataset.py +++ b/sentence_transformers/datasets/ParallelSentencesDataset.py @@ -77,9 +77,11 @@ def load_data( logger.info("Load " + filepath) parallel_sentences = [] - with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open( - filepath, encoding="utf8" - ) as fIn: + with ( + gzip.open(filepath, "rt", encoding="utf8") + if filepath.endswith(".gz") + else open(filepath, encoding="utf8") as fIn + ): count = 0 for line in fIn: sentences = line.strip().split("\t") diff --git a/sentence_transformers/evaluation/InformationRetrievalEvaluator.py b/sentence_transformers/evaluation/InformationRetrievalEvaluator.py index 738d46f0b..04aed142f 100644 --- a/sentence_transformers/evaluation/InformationRetrievalEvaluator.py +++ b/sentence_transformers/evaluation/InformationRetrievalEvaluator.py @@ -317,8 +317,10 @@ def compute_metrices( # Encode chunk of corpus if corpus_embeddings is None: - with nullcontext() if self.truncate_dim is None else corpus_model.truncate_sentence_embeddings( - self.truncate_dim + with ( + nullcontext() + if self.truncate_dim is None + else corpus_model.truncate_sentence_embeddings(self.truncate_dim) ): sub_corpus_embeddings 
= corpus_model.encode( self.corpus[corpus_start_idx:corpus_end_idx], diff --git a/sentence_transformers/evaluation/MSEEvaluator.py b/sentence_transformers/evaluation/MSEEvaluator.py index a2a77f715..c9f270f4d 100644 --- a/sentence_transformers/evaluation/MSEEvaluator.py +++ b/sentence_transformers/evaluation/MSEEvaluator.py @@ -81,8 +81,10 @@ def __init__( ): super().__init__() self.truncate_dim = truncate_dim - with nullcontext() if self.truncate_dim is None else teacher_model.truncate_sentence_embeddings( - self.truncate_dim + with ( + nullcontext() + if self.truncate_dim is None + else teacher_model.truncate_sentence_embeddings(self.truncate_dim) ): self.source_embeddings = teacher_model.encode( source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True diff --git a/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py b/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py index 5ac715716..cae8a8ddd 100644 --- a/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py +++ b/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py @@ -79,8 +79,10 @@ def __init__( self.csv_headers.append(f"{src_lang}-{trg_lang}") all_source_sentences = list(all_source_sentences) - with nullcontext() if self.truncate_dim is None else teacher_model.truncate_sentence_embeddings( - self.truncate_dim + with ( + nullcontext() + if self.truncate_dim is None + else teacher_model.truncate_sentence_embeddings(self.truncate_dim) ): all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size) self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)} diff --git a/sentence_transformers/evaluation/SequentialEvaluator.py b/sentence_transformers/evaluation/SequentialEvaluator.py index 39711a739..062d0a664 100644 --- a/sentence_transformers/evaluation/SequentialEvaluator.py +++ b/sentence_transformers/evaluation/SequentialEvaluator.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Iterable +from collections.abc import Iterable +from typing import TYPE_CHECKING from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator diff --git a/sentence_transformers/fit_mixin.py b/sentence_transformers/fit_mixin.py index 4ddabb7c3..e4cc4b046 100644 --- a/sentence_transformers/fit_mixin.py +++ b/sentence_transformers/fit_mixin.py @@ -4,8 +4,9 @@ import logging import os import shutil +from collections.abc import Iterable from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Iterable +from typing import TYPE_CHECKING, Any, Callable import numpy as np import torch diff --git a/sentence_transformers/losses/AdaptiveLayerLoss.py b/sentence_transformers/losses/AdaptiveLayerLoss.py index a65416e2f..8eb9cf4ed 100644 --- a/sentence_transformers/losses/AdaptiveLayerLoss.py +++ b/sentence_transformers/losses/AdaptiveLayerLoss.py @@ -2,7 +2,8 @@ import random import warnings -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any import torch from torch import Tensor, nn diff --git a/sentence_transformers/losses/BatchAllTripletLoss.py b/sentence_transformers/losses/BatchAllTripletLoss.py index cd337c9a2..0a402d40d 100644 --- a/sentence_transformers/losses/BatchAllTripletLoss.py +++ b/sentence_transformers/losses/BatchAllTripletLoss.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable from torch import Tensor, nn diff --git 
a/sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py b/sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py index db381622c..1f4b99a66 100644 --- a/sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py +++ b/sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable import torch from torch import Tensor diff --git a/sentence_transformers/losses/BatchHardTripletLoss.py b/sentence_transformers/losses/BatchHardTripletLoss.py index c407694eb..abf947c46 100644 --- a/sentence_transformers/losses/BatchHardTripletLoss.py +++ b/sentence_transformers/losses/BatchHardTripletLoss.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable import torch from torch import Tensor, nn diff --git a/sentence_transformers/losses/BatchSemiHardTripletLoss.py b/sentence_transformers/losses/BatchSemiHardTripletLoss.py index e29a65b95..1e05c48e8 100644 --- a/sentence_transformers/losses/BatchSemiHardTripletLoss.py +++ b/sentence_transformers/losses/BatchSemiHardTripletLoss.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable import torch from torch import Tensor, nn diff --git a/sentence_transformers/losses/CachedGISTEmbedLoss.py b/sentence_transformers/losses/CachedGISTEmbedLoss.py index aa83c59e8..e8e131582 100644 --- a/sentence_transformers/losses/CachedGISTEmbedLoss.py +++ b/sentence_transformers/losses/CachedGISTEmbedLoss.py @@ -1,8 +1,9 @@ from __future__ import annotations +from collections.abc import Iterable, Iterator from contextlib import nullcontext from functools import partial -from typing import Any, Iterable, Iterator +from typing import Any import torch import tqdm diff --git a/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py b/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py index 9c787fe8b..544c73891 100644 --- a/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py +++ b/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py @@ -1,8 +1,9 @@ from __future__ import annotations +from collections.abc import Iterable, Iterator from contextlib import nullcontext from functools import partial -from typing import Any, Iterable, Iterator +from typing import Any import torch import tqdm diff --git a/sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py b/sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py index ac82d133f..be77b1fbf 100644 --- a/sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py +++ b/sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py @@ -1,8 +1,9 @@ from __future__ import annotations +from collections.abc import Iterable, Iterator from contextlib import nullcontext from functools import partial -from typing import Any, Iterable, Iterator +from typing import Any import torch import tqdm diff --git a/sentence_transformers/losses/CoSENTLoss.py b/sentence_transformers/losses/CoSENTLoss.py index 00b304695..a07dbd835 100644 --- a/sentence_transformers/losses/CoSENTLoss.py +++ b/sentence_transformers/losses/CoSENTLoss.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any import torch from torch import Tensor, nn diff --git 
a/sentence_transformers/losses/ContrastiveLoss.py b/sentence_transformers/losses/ContrastiveLoss.py index 6fdb02180..e69532abf 100644 --- a/sentence_transformers/losses/ContrastiveLoss.py +++ b/sentence_transformers/losses/ContrastiveLoss.py @@ -1,7 +1,8 @@ from __future__ import annotations +from collections.abc import Iterable from enum import Enum -from typing import Any, Iterable +from typing import Any import torch.nn.functional as F from torch import Tensor, nn diff --git a/sentence_transformers/losses/ContrastiveTensionLoss.py b/sentence_transformers/losses/ContrastiveTensionLoss.py index 7af82c253..d0f695e1e 100644 --- a/sentence_transformers/losses/ContrastiveTensionLoss.py +++ b/sentence_transformers/losses/ContrastiveTensionLoss.py @@ -3,7 +3,7 @@ import copy import math import random -from typing import Iterable +from collections.abc import Iterable import numpy as np import torch diff --git a/sentence_transformers/losses/CosineSimilarityLoss.py b/sentence_transformers/losses/CosineSimilarityLoss.py index 5a769a588..980cb47cd 100644 --- a/sentence_transformers/losses/CosineSimilarityLoss.py +++ b/sentence_transformers/losses/CosineSimilarityLoss.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any import torch from torch import Tensor, nn diff --git a/sentence_transformers/losses/DenoisingAutoEncoderLoss.py b/sentence_transformers/losses/DenoisingAutoEncoderLoss.py index 8f38342d7..5c9b2a30e 100644 --- a/sentence_transformers/losses/DenoisingAutoEncoderLoss.py +++ b/sentence_transformers/losses/DenoisingAutoEncoderLoss.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Iterable +from collections.abc import Iterable from torch import Tensor, nn from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel diff --git a/sentence_transformers/losses/GISTEmbedLoss.py b/sentence_transformers/losses/GISTEmbedLoss.py index 51958da5e..5111b2b0c 100644 --- a/sentence_transformers/losses/GISTEmbedLoss.py +++ b/sentence_transformers/losses/GISTEmbedLoss.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any import torch from torch import Tensor, nn diff --git a/sentence_transformers/losses/MSELoss.py b/sentence_transformers/losses/MSELoss.py index 02ec3a568..c94b7ffbc 100644 --- a/sentence_transformers/losses/MSELoss.py +++ b/sentence_transformers/losses/MSELoss.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable import torch from torch import Tensor, nn diff --git a/sentence_transformers/losses/MarginMSELoss.py b/sentence_transformers/losses/MarginMSELoss.py index d11f9bd19..efe72efaa 100644 --- a/sentence_transformers/losses/MarginMSELoss.py +++ b/sentence_transformers/losses/MarginMSELoss.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable from torch import Tensor, nn diff --git a/sentence_transformers/losses/MatryoshkaLoss.py b/sentence_transformers/losses/MatryoshkaLoss.py index 997e7be0b..557574262 100644 --- a/sentence_transformers/losses/MatryoshkaLoss.py +++ b/sentence_transformers/losses/MatryoshkaLoss.py @@ -2,7 +2,8 @@ import random import warnings -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any import torch.nn.functional as F from torch 
import Tensor, nn diff --git a/sentence_transformers/losses/MegaBatchMarginLoss.py b/sentence_transformers/losses/MegaBatchMarginLoss.py index 22dbbe5ea..38873cebf 100644 --- a/sentence_transformers/losses/MegaBatchMarginLoss.py +++ b/sentence_transformers/losses/MegaBatchMarginLoss.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable import torch import torch.nn.functional as F diff --git a/sentence_transformers/losses/MultipleNegativesRankingLoss.py b/sentence_transformers/losses/MultipleNegativesRankingLoss.py index 6cc64d12a..1aea7acfa 100644 --- a/sentence_transformers/losses/MultipleNegativesRankingLoss.py +++ b/sentence_transformers/losses/MultipleNegativesRankingLoss.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any import torch from torch import Tensor, nn diff --git a/sentence_transformers/losses/MultipleNegativesSymmetricRankingLoss.py b/sentence_transformers/losses/MultipleNegativesSymmetricRankingLoss.py index a512f5404..7a2e488ff 100644 --- a/sentence_transformers/losses/MultipleNegativesSymmetricRankingLoss.py +++ b/sentence_transformers/losses/MultipleNegativesSymmetricRankingLoss.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any import torch from torch import Tensor, nn diff --git a/sentence_transformers/losses/OnlineContrastiveLoss.py b/sentence_transformers/losses/OnlineContrastiveLoss.py index e9fbb28bc..c560df8b3 100644 --- a/sentence_transformers/losses/OnlineContrastiveLoss.py +++ b/sentence_transformers/losses/OnlineContrastiveLoss.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable import torch.nn.functional as F from torch import Tensor, nn diff --git a/sentence_transformers/losses/SoftmaxLoss.py b/sentence_transformers/losses/SoftmaxLoss.py index 48a30c452..deb2faa3a 100644 --- a/sentence_transformers/losses/SoftmaxLoss.py +++ b/sentence_transformers/losses/SoftmaxLoss.py @@ -1,7 +1,8 @@ from __future__ import annotations import logging -from typing import Callable, Iterable +from collections.abc import Iterable +from typing import Callable import torch import transformers diff --git a/sentence_transformers/losses/TripletLoss.py b/sentence_transformers/losses/TripletLoss.py index 874937d82..73d87540e 100644 --- a/sentence_transformers/losses/TripletLoss.py +++ b/sentence_transformers/losses/TripletLoss.py @@ -1,7 +1,8 @@ from __future__ import annotations +from collections.abc import Iterable from enum import Enum -from typing import Any, Iterable +from typing import Any import torch.nn.functional as F from torch import Tensor, nn diff --git a/sentence_transformers/model_card_template.md b/sentence_transformers/model_card_template.md index db8e4d451..ae428d3fe 100644 --- a/sentence_transformers/model_card_template.md +++ b/sentence_transformers/model_card_template.md @@ -242,4 +242,4 @@ Carbon emissions were measured using [CodeCarbon](https://github.com/mlco2/codec ## Model Card Contact *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.* ---> \ No newline at end of file +--> diff --git a/sentence_transformers/models/Asym.py b/sentence_transformers/models/Asym.py index 4191fe0bc..9549bffc0 100644 --- a/sentence_transformers/models/Asym.py +++ 
b/sentence_transformers/models/Asym.py @@ -3,7 +3,6 @@ import json import os from collections import OrderedDict -from typing import List from torch import Tensor, nn @@ -45,7 +44,7 @@ def __init__(self, sub_modules: dict[str, list[nn.Module]], allow_empty_key: boo ordered_dict = OrderedDict() for name, models in sub_modules.items(): - if not isinstance(models, List): + if not isinstance(models, list): models = [models] for idx, model in enumerate(models): diff --git a/sentence_transformers/models/WordEmbeddings.py b/sentence_transformers/models/WordEmbeddings.py index 50697fd97..ed6b756aa 100644 --- a/sentence_transformers/models/WordEmbeddings.py +++ b/sentence_transformers/models/WordEmbeddings.py @@ -137,9 +137,11 @@ def from_text_file( vocab = [] embeddings = [] - with gzip.open(embeddings_file_path, "rt", encoding="utf8") if embeddings_file_path.endswith(".gz") else open( - embeddings_file_path, encoding="utf8" - ) as fIn: + with ( + gzip.open(embeddings_file_path, "rt", encoding="utf8") + if embeddings_file_path.endswith(".gz") + else open(embeddings_file_path, encoding="utf8") as fIn + ): iterator = tqdm(fIn, desc="Load Word Embeddings", unit="Embeddings") for line in iterator: split = line.rstrip().split(item_separator) diff --git a/sentence_transformers/models/tokenizer/PhraseTokenizer.py b/sentence_transformers/models/tokenizer/PhraseTokenizer.py index 1e4428c9c..0a120b159 100644 --- a/sentence_transformers/models/tokenizer/PhraseTokenizer.py +++ b/sentence_transformers/models/tokenizer/PhraseTokenizer.py @@ -5,7 +5,7 @@ import logging import os import string -from typing import Iterable +from collections.abc import Iterable from transformers.utils.import_utils import NLTK_IMPORT_ERROR, is_nltk_available diff --git a/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py b/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py index 4d4eb5765..1780538e8 100644 --- a/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py +++ b/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py @@ -4,7 +4,7 @@ import json import os import string -from typing import Iterable +from collections.abc import Iterable from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer diff --git a/sentence_transformers/models/tokenizer/WordTokenizer.py b/sentence_transformers/models/tokenizer/WordTokenizer.py index 15796ddd5..a64d02077 100644 --- a/sentence_transformers/models/tokenizer/WordTokenizer.py +++ b/sentence_transformers/models/tokenizer/WordTokenizer.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Iterable +from collections.abc import Iterable ENGLISH_STOP_WORDS = [ "!", diff --git a/sentence_transformers/readers/STSDataReader.py b/sentence_transformers/readers/STSDataReader.py index 61a0011f1..6fc754600 100644 --- a/sentence_transformers/readers/STSDataReader.py +++ b/sentence_transformers/readers/STSDataReader.py @@ -38,9 +38,11 @@ def __init__( def get_examples(self, filename, max_examples=0): """filename specified which data split to use (train.csv, dev.csv, test.csv).""" filepath = os.path.join(self.dataset_folder, filename) - with gzip.open(filepath, "rt", encoding="utf8") if filename.endswith(".gz") else open( - filepath, encoding="utf-8" - ) as fIn: + with ( + gzip.open(filepath, "rt", encoding="utf8") + if filename.endswith(".gz") + else open(filepath, encoding="utf-8") as fIn + ): data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) examples = [] for id, row in enumerate(data): 
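The other recurring rewrite in this patch, visible above in `create_splits.py`, `train_mlm.py`, `ParallelSentencesDataset.py`, `WordEmbeddings.py`, and `STSDataReader.py` among others, converts the old form that relied on implicit line continuation inside the `open(...)` call into a parenthesized with-statement. CPython's PEG parser has accepted parenthesized context managers since 3.9, although they were only documented in 3.10, so this change also depends on dropping Python 3.8. Note that `as fIn` binds the value of the entire conditional expression, not just the `open(...)` branch, so `fIn` is valid for both plain and gzipped files. A self-contained sketch with an illustrative helper name:

    import gzip


    def read_corpus(filepath: str) -> list[str]:
        # fIn is whichever file object the ternary produced
        with (
            gzip.open(filepath, "rt", encoding="utf8")
            if filepath.endswith(".gz")
            else open(filepath, encoding="utf8") as fIn
        ):
            return [line.strip() for line in fIn if line.strip()]
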
diff --git a/sentence_transformers/sampler.py b/sentence_transformers/sampler.py index 615c72f99..f78623c5a 100644 --- a/sentence_transformers/sampler.py +++ b/sentence_transformers/sampler.py @@ -2,8 +2,9 @@ import logging from collections import defaultdict +from collections.abc import Iterator from itertools import accumulate, cycle -from typing import Any, Iterator +from typing import Any import torch from torch.utils.data import BatchSampler, ConcatDataset, SubsetRandomSampler diff --git a/tests/test_cross_encoder.py b/tests/test_cross_encoder.py index 2e148fc97..9c17aa093 100644 --- a/tests/test_cross_encoder.py +++ b/tests/test_cross_encoder.py @@ -7,8 +7,8 @@ import csv import gzip import os +from collections.abc import Generator from pathlib import Path -from typing import Generator import numpy as np import pytest diff --git a/tests/test_sentence_transformer.py b/tests/test_sentence_transformer.py index c3aaee807..3253bf2f9 100644 --- a/tests/test_sentence_transformer.py +++ b/tests/test_sentence_transformer.py @@ -10,7 +10,7 @@ import re from functools import partial from pathlib import Path -from typing import Dict, List, Literal, cast +from typing import Literal, cast import numpy as np import pytest @@ -492,10 +492,10 @@ def test(model: SentenceTransformer, expected_dim: int): # Extract the sentence embeddings out of outputs if output_value is None: # We get the whole plate - if not isinstance(outputs, List): + if not isinstance(outputs, list): embeddings = outputs["sentence_embedding"] else: - outputs = cast(List[Dict[str, torch.Tensor]], outputs) + outputs = cast(list[dict[str, torch.Tensor]], outputs) embeddings = [out_features["sentence_embedding"] for out_features in outputs] else: embeddings = outputs diff --git a/tests/test_train_stsb.py b/tests/test_train_stsb.py index 0414b9128..c1c5dd46e 100644 --- a/tests/test_train_stsb.py +++ b/tests/test_train_stsb.py @@ -7,7 +7,7 @@ import csv import gzip import os -from typing import Generator +from collections.abc import Generator import pytest import torch diff --git a/tests/utils.py b/tests/utils.py index b91059b13..658a6e188 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,7 +8,7 @@ class SafeTemporaryDirectory(tempfile.TemporaryDirectory): The GitHub Actions CI on Windows sometimes raises a NotADirectoryError when cleaning up the temporary directory. This class is a workaround to avoid the error. - Unlike tempfile.TemporaryDirectory(ignore_cleanup_errors=True), this also works on Python 3.8 and 3.9. + Unlike tempfile.TemporaryDirectory(ignore_cleanup_errors=True), this also works on Python 3.9. """ def __init__(self, *args, **kwargs) -> None:
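
Taken together with the new test in patch 1, the series makes the following kind of trainer construction work again on transformers >= v4.46.0: an evaluator drives evaluation, no `eval_dataset` is passed, and `eval_strategy` is still "steps". A condensed sketch mirroring the new test; the model and dataset names are illustrative stand-ins, not taken from this diff:

    from datasets import load_dataset

    from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
    from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
    from sentence_transformers.training_args import SentenceTransformerTrainingArguments

    model = SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors")
    stsb = load_dataset("sentence-transformers/stsb")  # assumed columns: sentence1, sentence2, score
    validation = stsb["validation"].select(range(100))

    trainer = SentenceTransformerTrainer(
        model=model,
        args=SentenceTransformerTrainingArguments(output_dir="output", eval_strategy="steps"),
        train_dataset=stsb["train"].select(range(100)),
        loss=losses.CosineSimilarityLoss(model=model),
        # No eval_dataset: the evaluator alone drives evaluation. Before this fix,
        # eval_dataset=None reached transformers, which raises a ValueError for
        # eval_strategy != "no" from v4.46.0 onward.
        evaluator=EmbeddingSimilarityEvaluator(
            sentences1=validation["sentence1"],
            sentences2=validation["sentence2"],
            scores=validation["score"],
        ),
    )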