From f97657d7daecc172da7c42bb9c0ccbbe11060e55 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 10 Oct 2024 20:00:01 +0200 Subject: [PATCH 01/12] Increment dev version --- pyproject.toml | 2 +- sentence_transformers/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8771d81a5..830d6bb8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "sentence-transformers" -version = "3.2.0.dev0" +version = "3.3.0.dev0" description = "State-of-the-Art Text Embeddings" license = { text = "Apache 2.0" } readme = "README.md" diff --git a/sentence_transformers/__init__.py b/sentence_transformers/__init__.py index 2c382bdb9..1ba4558e8 100644 --- a/sentence_transformers/__init__.py +++ b/sentence_transformers/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -__version__ = "3.2.0.dev0" +__version__ = "3.3.0.dev0" __MODEL_HUB_ORGANIZATION__ = "sentence-transformers" import importlib From a4be00f3fcb635f536566044d40c41513a495818 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 15 Oct 2024 09:40:20 +0200 Subject: [PATCH 02/12] Bump optimum version (#2984) --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 830d6bb8c..11550a5e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,8 +49,8 @@ Repository = "https://github.com/UKPLab/sentence-transformers/" [project.optional-dependencies] train = ["datasets", "accelerate>=0.20.3"] -onnx = ["optimum[onnxruntime]>=1.23.0"] -onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.0"] +onnx = ["optimum[onnxruntime]>=1.23.1"] +onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.1"] openvino = ["optimum-intel[openvino]>=1.20.0"] dev = ["datasets", "accelerate>=0.20.3", "pre-commit", "pytest", "pytest-cov"] @@ -100,4 +100,4 @@ testpaths = [ addopts = "--strict-markers -m 'not slow'" markers = [ "slow: marks tests as slow" -] \ No newline at end of file +] From a1db32df6b209d99c5bb5412c3e1a28f039c8e6b Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Tue, 15 Oct 2024 12:10:27 +0200 Subject: [PATCH 03/12] [`docs`] Update the training snippets for some losses that should use the v3 Trainer (#2987) --- .../losses/Matryoshka2dLoss.py | 26 ++++++++------- .../losses/MatryoshkaLoss.py | 26 ++++++++------- .../losses/MegaBatchMarginLoss.py | 33 +++++++++++-------- 3 files changed, 47 insertions(+), 38 deletions(-) diff --git a/sentence_transformers/losses/Matryoshka2dLoss.py b/sentence_transformers/losses/Matryoshka2dLoss.py index 4b77b9c74..7c85884d5 100644 --- a/sentence_transformers/losses/Matryoshka2dLoss.py +++ b/sentence_transformers/losses/Matryoshka2dLoss.py @@ -95,21 +95,23 @@ def __init__( Example: :: - from sentence_transformers import SentenceTransformer, losses, InputExample - from torch.utils.data import DataLoader + from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses + from datasets import Dataset model = SentenceTransformer("microsoft/mpnet-base") - train_examples = [ - InputExample(texts=['Anchor 1', 'Positive 1']), - InputExample(texts=['Anchor 2', 'Positive 2']), - ] - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32) - train_loss = losses.MultipleNegativesRankingLoss(model=model) - train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64]) - model.fit( - [(train_dataloader, train_loss)], - epochs=10, + train_dataset = Dataset.from_dict({ + "anchor": ["It's nice weather outside today.", "He drove to work."], + "positive": ["It's so sunny.", "He took the car to the office."], + }) + loss = losses.MultipleNegativesRankingLoss(model) + loss = losses.Matryoshka2dLoss(model, loss, [768, 512, 256, 128, 64]) + + trainer = SentenceTransformerTrainer( + model=model, + train_dataset=train_dataset, + loss=loss, ) + trainer.train() """ matryoshka_loss = MatryoshkaLoss( model, diff --git a/sentence_transformers/losses/MatryoshkaLoss.py b/sentence_transformers/losses/MatryoshkaLoss.py index e6a18aac0..997e7be0b 100644 --- a/sentence_transformers/losses/MatryoshkaLoss.py +++ b/sentence_transformers/losses/MatryoshkaLoss.py @@ -101,21 +101,23 @@ def __init__( Example: :: - from sentence_transformers import SentenceTransformer, losses, InputExample - from torch.utils.data import DataLoader + from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses + from datasets import Dataset model = SentenceTransformer("microsoft/mpnet-base") - train_examples = [ - InputExample(texts=['Anchor 1', 'Positive 1']), - InputExample(texts=['Anchor 2', 'Positive 2']), - ] - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32) - train_loss = losses.MultipleNegativesRankingLoss(model=model) - train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64]) - model.fit( - [(train_dataloader, train_loss)], - epochs=10, + train_dataset = Dataset.from_dict({ + "anchor": ["It's nice weather outside today.", "He drove to work."], + "positive": ["It's so sunny.", "He took the car to the office."], + }) + loss = losses.MultipleNegativesRankingLoss(model) + loss = losses.MatryoshkaLoss(model, loss, [768, 512, 256, 128, 64]) + + trainer = SentenceTransformerTrainer( + model=model, + train_dataset=train_dataset, + loss=loss, ) + trainer.train() """ super().__init__() self.model = model diff --git a/sentence_transformers/losses/MegaBatchMarginLoss.py b/sentence_transformers/losses/MegaBatchMarginLoss.py index a964eb726..22dbbe5ea 100644 --- a/sentence_transformers/losses/MegaBatchMarginLoss.py +++ b/sentence_transformers/losses/MegaBatchMarginLoss.py @@ -59,25 +59,30 @@ def __init__( Example: :: - from sentence_transformers import SentenceTransformer, InputExample, losses - from torch.utils.data import DataLoader + from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer, losses + from datasets import Dataset - model = SentenceTransformer('all-MiniLM-L6-v2') - - total_examples = 500 train_batch_size = 250 train_mini_batch_size = 32 - train_examples = [ - InputExample(texts=[f"This is sentence number {i}", f"This is sentence number {i+1}"]) for i in range(total_examples) - ] - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size) - train_loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size) - - model.fit( - [(train_dataloader, train_loss)], - epochs=10, + model = SentenceTransformer('all-MiniLM-L6-v2') + train_dataset = Dataset.from_dict({ + "anchor": [f"This is sentence number {i}" for i in range(500)], + "positive": [f"This is sentence number {i}" for i in range(1, 501)], + }) + loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size) + + args = SentenceTransformerTrainingArguments( + output_dir="output", + per_device_train_batch_size=train_batch_size, + ) + trainer = SentenceTransformerTrainer( + model=model, + args=args, + train_dataset=train_dataset, + loss=loss, ) + trainer.train() """ super().__init__() self.model = model From 72d5649258263eec28e07c1572e0bcc21e74b884 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:53:29 +0200 Subject: [PATCH 04/12] [`enh`] Throw error if StaticEmbedding-based model is trained with incompatible loss (#2990) --- sentence_transformers/losses/CachedGISTEmbedLoss.py | 7 ++++++- .../losses/CachedMultipleNegativesRankingLoss.py | 7 +++++++ .../losses/CachedMultipleNegativesSymmetricRankingLoss.py | 7 +++++++ sentence_transformers/losses/DenoisingAutoEncoderLoss.py | 7 +++++++ sentence_transformers/losses/GISTEmbedLoss.py | 8 +++++++- 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/sentence_transformers/losses/CachedGISTEmbedLoss.py b/sentence_transformers/losses/CachedGISTEmbedLoss.py index 5a99fa419..aa83c59e8 100644 --- a/sentence_transformers/losses/CachedGISTEmbedLoss.py +++ b/sentence_transformers/losses/CachedGISTEmbedLoss.py @@ -10,7 +10,7 @@ from torch.utils.checkpoint import get_device_states, set_device_states from sentence_transformers import SentenceTransformer -from sentence_transformers.models import Transformer +from sentence_transformers.models import StaticEmbedding, Transformer class RandContext: @@ -139,6 +139,11 @@ def __init__( trainer.train() """ super().__init__() + if isinstance(model[0], StaticEmbedding): + raise ValueError( + "CachedGISTEmbedLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. " + "Consider using GISTEmbedLoss instead." + ) self.model = model self.guide = guide self.temperature = temperature diff --git a/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py b/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py index c1e7d67c1..9c787fe8b 100644 --- a/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py +++ b/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py @@ -10,6 +10,7 @@ from torch.utils.checkpoint import get_device_states, set_device_states from sentence_transformers import SentenceTransformer, util +from sentence_transformers.models import StaticEmbedding class RandContext: @@ -145,6 +146,12 @@ def __init__( trainer.train() """ super().__init__() + if isinstance(model[0], StaticEmbedding): + raise ValueError( + "CachedMultipleNegativesRankingLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. " + "Consider using MultipleNegativesRankingLoss instead." + ) + self.model = model self.scale = scale self.similarity_fct = similarity_fct diff --git a/sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py b/sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py index 83fe1e06f..ac82d133f 100644 --- a/sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py +++ b/sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py @@ -10,6 +10,7 @@ from sentence_transformers import SentenceTransformer, util from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import RandContext +from sentence_transformers.models import StaticEmbedding def _backward_hook( @@ -114,6 +115,12 @@ def __init__( - Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup: https://arxiv.org/pdf/2101.06983.pdf """ super().__init__() + if isinstance(model[0], StaticEmbedding): + raise ValueError( + "CachedMultipleNegativesSymmetricRankingLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. " + "Consider using MultipleNegativesSymmetricRankingLoss instead." + ) + self.model = model self.scale = scale self.similarity_fct = similarity_fct diff --git a/sentence_transformers/losses/DenoisingAutoEncoderLoss.py b/sentence_transformers/losses/DenoisingAutoEncoderLoss.py index bb1cf8bef..8f38342d7 100644 --- a/sentence_transformers/losses/DenoisingAutoEncoderLoss.py +++ b/sentence_transformers/losses/DenoisingAutoEncoderLoss.py @@ -7,6 +7,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel from sentence_transformers import SentenceTransformer +from sentence_transformers.models import StaticEmbedding logger = logging.getLogger(__name__) @@ -73,6 +74,12 @@ def __init__( ) """ super().__init__() + + if isinstance(model[0], StaticEmbedding): + raise ValueError( + "DenoisingAutoEncoderLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding." + ) + self.encoder = model # This will be the final model used during the inference time. self.tokenizer_encoder = model.tokenizer diff --git a/sentence_transformers/losses/GISTEmbedLoss.py b/sentence_transformers/losses/GISTEmbedLoss.py index f1bb833bd..51958da5e 100644 --- a/sentence_transformers/losses/GISTEmbedLoss.py +++ b/sentence_transformers/losses/GISTEmbedLoss.py @@ -5,7 +5,7 @@ import torch from torch import Tensor, nn -from sentence_transformers.models import Transformer +from sentence_transformers.models import StaticEmbedding, Transformer from sentence_transformers.SentenceTransformer import SentenceTransformer @@ -91,6 +91,12 @@ def __init__( if self.must_retokenize: self.tokenizer = self.model.tokenizer + if isinstance(self.model[0], StaticEmbedding): + raise ValueError( + "If we must retokenize because the guide model has a different tokenizer, " + "then the Sentence Transformer model must not be based on a StaticEmbedding." + ) + def sim_matrix(self, embed1: Tensor, embed2: Tensor) -> Tensor: return self.similarity_fct(embed1.unsqueeze(1), embed2.unsqueeze(0)) From 1802076d4eae42ff0a5629e1b04e75785d4e193b Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:53:46 +0200 Subject: [PATCH 05/12] [`fix`] Fix semantic_search_usearch with 'binary' (#2989) * Fix semantic_search_usearch with 'binary' * Add b1 support back, but with ubinary --- .../semantic_search_usearch.py | 4 ++-- sentence_transformers/quantization.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/examples/applications/embedding-quantization/semantic_search_usearch.py b/examples/applications/embedding-quantization/semantic_search_usearch.py index 03883a330..9af0e49f3 100644 --- a/examples/applications/embedding-quantization/semantic_search_usearch.py +++ b/examples/applications/embedding-quantization/semantic_search_usearch.py @@ -6,7 +6,7 @@ from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch # 1. Load the quora corpus with questions -dataset = load_dataset("quora", split="train").map( +dataset = load_dataset("quora", split="train", trust_remote_code=True).map( lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]}, batched=True, remove_columns=["questions", "is_duplicate"], @@ -26,7 +26,7 @@ # 4. Choose a target precision for the corpus embeddings corpus_precision = "binary" # Valid options are: "float32", "uint8", "int8", "ubinary", and "binary" -# But usearch only supports "float32", "int8", and "binary" +# But usearch only supports "float32", "int8", "binary" and "ubinary" # 5. Encode the corpus full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True) diff --git a/sentence_transformers/quantization.py b/sentence_transformers/quantization.py index 37402cae7..aa5be00f0 100644 --- a/sentence_transformers/quantization.py +++ b/sentence_transformers/quantization.py @@ -216,8 +216,8 @@ def semantic_search_usearch( `corpus_embeddings` or `corpus_index` should be used, not both. corpus_precision: Precision of the corpus embeddings. The - options are "float32", "int8", or "binary". Default is - "float32". + options are "float32", "int8", "ubinary" or "binary". Default + is "float32". top_k: Number of top results to retrieve. Default is 10. ranges: Ranges for quantization of embeddings. This is only used for int8 quantization, where the ranges refers to the @@ -263,8 +263,8 @@ def semantic_search_usearch( raise ValueError("Only corpus_embeddings or corpus_index should be used, not both.") if corpus_embeddings is None and corpus_index is None: raise ValueError("Either corpus_embeddings or corpus_index should be used.") - if corpus_precision not in ["float32", "int8", "binary"]: - raise ValueError('corpus_precision must be "float32", "int8", or "binary" for usearch') + if corpus_precision not in ["float32", "int8", "ubinary", "binary"]: + raise ValueError('corpus_precision must be "float32", "int8", "ubinary", "binary" for usearch') # If corpus_index is not provided, create a new index if corpus_index is None: @@ -284,6 +284,12 @@ def semantic_search_usearch( corpus_index = Index( ndim=corpus_embeddings.shape[1], metric="hamming", + dtype="i8", + ) + elif corpus_precision == "ubinary": + corpus_index = Index( + ndim=corpus_embeddings.shape[1] * 8, + metric="hamming", dtype="b1", ) corpus_index.add(np.arange(len(corpus_embeddings)), corpus_embeddings) @@ -331,7 +337,7 @@ def semantic_search_usearch( if rescore_embeddings is not None: top_k_embeddings = np.array([corpus_index.get(query_indices) for query_indices in indices]) # If the corpus precision is binary, we need to unpack the bits - if corpus_precision == "binary": + if corpus_precision in ("ubinary", "binary"): top_k_embeddings = np.unpackbits(top_k_embeddings.astype(np.uint8), axis=-1) top_k_embeddings = top_k_embeddings.astype(int) From 2fa3ed4f2d829911c3301f73dede3835ba4c09f9 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 18 Oct 2024 14:03:47 +0200 Subject: [PATCH 06/12] Fix copy-paste error in a comment --- sentence_transformers/models/Transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py index 7592278bf..061098c37 100644 --- a/sentence_transformers/models/Transformer.py +++ b/sentence_transformers/models/Transformer.py @@ -155,7 +155,7 @@ def _load_openvino_model(self, model_name_or_path, config, cache_dir, **model_ar else: model_args["ov_config"] = {} - # Either load an exported model, or export the model to ONNX + # Either load an exported model, or export the model to OpenVINO self.auto_model: OVModelForFeatureExtraction = OVModelForFeatureExtraction.from_pretrained( model_name_or_path, config=config, From 5e1a7a421d09d5ab200c3775fdc0829159f79f9b Mon Sep 17 00:00:00 2001 From: yaohwang Date: Fri, 18 Oct 2024 20:38:14 +0800 Subject: [PATCH 07/12] [enh] Add support for large_string in model card create (#2999) * [enh] Add support for large_string in model card create * [enh] Add support for large_string in model card create, with pre-commit checked --- sentence_transformers/model_card.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sentence_transformers/model_card.py b/sentence_transformers/model_card.py index 99da35d96..401d3b5a9 100644 --- a/sentence_transformers/model_card.py +++ b/sentence_transformers/model_card.py @@ -423,7 +423,9 @@ def set_widget_examples(self, dataset: Dataset | DatasetDict) -> None: columns = [ column for column, feature in dataset[dataset_name].features.items() - if isinstance(feature, Value) and feature.dtype == "string" and column != "dataset_name" + if isinstance(feature, Value) + and (feature.dtype == "string" or feature.dtype == "large_string") + and column != "dataset_name" ] str_dataset = dataset[dataset_name].select_columns(columns) dataset_size = len(str_dataset) From 0e59af636cc10ffa66caf25d12a5faf8f065bf5c Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Fri, 18 Oct 2024 14:38:33 +0200 Subject: [PATCH 08/12] [`model cards`] Prevent crash on generating widgets if dataset column is empty (#2997) (or if it has no string columns) --- sentence_transformers/model_card.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sentence_transformers/model_card.py b/sentence_transformers/model_card.py index 401d3b5a9..ef279e11f 100644 --- a/sentence_transformers/model_card.py +++ b/sentence_transformers/model_card.py @@ -429,6 +429,9 @@ def set_widget_examples(self, dataset: Dataset | DatasetDict) -> None: ] str_dataset = dataset[dataset_name].select_columns(columns) dataset_size = len(str_dataset) + if dataset_size == 0: + continue + lengths = {} for idx, sample in enumerate( str_dataset.select(random.sample(range(dataset_size), k=min(num_samples_to_check, dataset_size))) From 29535eb44d270f5665e389aeb7e18cf04cb51bf0 Mon Sep 17 00:00:00 2001 From: Thomas van Dongen Date: Fri, 18 Oct 2024 14:38:55 +0200 Subject: [PATCH 09/12] [fix] Added model2vec import compatible with current and newer version (#2992) * Added model2vec import compatible with current and newer version * Switched to importlib for version check * Added catch for PackageNotFoundError * Simplified code * Ran precommit * Add tests for Static Embeddings * Add support for future model2vec version based on numpy * Add model2vec to dev extra, as it's needed for tests * Skip tests if no model2vec, install model2vec only for Python 3.10+ * Keep installing dev on GitHub CI --------- Co-authored-by: Tom Aarsen --- .github/workflows/tests.yml | 4 + .../models/StaticEmbedding.py | 16 +++- tests/models/test_static_embedding.py | 76 +++++++++++++++++++ 3 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 tests/models/test_static_embedding.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f803de40a..12c619375 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -63,6 +63,10 @@ jobs: python -m pip install --upgrade pip python -m pip install '.[train, onnx, openvino, dev]' + - name: Install model2vec + run: python -m pip install model2vec + if: ${{ contains(fromJSON('["3.10", "3.11", "3.12"]'), matrix.python-version) }} + - name: Run unit tests run: | python -m pytest --durations 20 -sv tests/ diff --git a/sentence_transformers/models/StaticEmbedding.py b/sentence_transformers/models/StaticEmbedding.py index de69285b2..fae3756e2 100644 --- a/sentence_transformers/models/StaticEmbedding.py +++ b/sentence_transformers/models/StaticEmbedding.py @@ -159,9 +159,11 @@ def from_distillation( """ try: - from model2vec import distill + from model2vec.distill import distill except ImportError: - raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`") + raise ImportError( + "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`" + ) device = get_device_name() static_model = distill( @@ -172,7 +174,10 @@ def from_distillation( apply_zipf=apply_zipf, use_subword=use_subword, ) - embedding_weights = static_model.embedding.weight + if isinstance(static_model.embedding, np.ndarray): + embedding_weights = torch.from_numpy(static_model.embedding) + else: + embedding_weights = static_model.embedding.weight tokenizer: Tokenizer = static_model.tokenizer return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name) @@ -200,7 +205,10 @@ def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding: raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`") static_model = StaticModel.from_pretrained(model_id_or_path) - embedding_weights = static_model.embedding.weight + if isinstance(static_model.embedding, np.ndarray): + embedding_weights = torch.from_numpy(static_model.embedding) + else: + embedding_weights = static_model.embedding.weight tokenizer: Tokenizer = static_model.tokenizer return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path) diff --git a/tests/models/test_static_embedding.py b/tests/models/test_static_embedding.py new file mode 100644 index 000000000..75041d852 --- /dev/null +++ b/tests/models/test_static_embedding.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest +from tokenizers import Tokenizer + +from sentence_transformers.models.StaticEmbedding import StaticEmbedding + +try: + import model2vec +except ImportError: + model2vec = None + +skip_if_no_model2vec = pytest.mark.skipif(model2vec is None, reason="The model2vec library is not installed.") + + +@pytest.fixture +def tokenizer() -> Tokenizer: + return Tokenizer.from_pretrained("bert-base-uncased") + + +@pytest.fixture +def embedding_weights(): + return np.random.rand(30522, 768) + + +@pytest.fixture +def static_embedding(tokenizer: Tokenizer, embedding_weights) -> StaticEmbedding: + return StaticEmbedding(tokenizer, embedding_weights=embedding_weights) + + +def test_initialization_with_embedding_weights(tokenizer: Tokenizer, embedding_weights) -> None: + model = StaticEmbedding(tokenizer, embedding_weights=embedding_weights) + assert model.embedding.weight.shape == (30522, 768) + + +def test_initialization_with_embedding_dim(tokenizer: Tokenizer) -> None: + model = StaticEmbedding(tokenizer, embedding_dim=768) + assert model.embedding.weight.shape == (30522, 768) + + +def test_tokenize(static_embedding: StaticEmbedding) -> None: + texts = ["Hello world!", "How are you?"] + tokens = static_embedding.tokenize(texts) + assert "input_ids" in tokens + assert "offsets" in tokens + + +def test_forward(static_embedding: StaticEmbedding) -> None: + texts = ["Hello world!", "How are you?"] + tokens = static_embedding.tokenize(texts) + output = static_embedding(tokens) + assert "sentence_embedding" in output + + +def test_save_and_load(tmp_path: Path, static_embedding: StaticEmbedding) -> None: + save_dir = tmp_path / "model" + save_dir.mkdir() + static_embedding.save(str(save_dir)) + + loaded_model = StaticEmbedding.load(str(save_dir)) + assert loaded_model.embedding.weight.shape == static_embedding.embedding.weight.shape + + +@skip_if_no_model2vec() +def test_from_distillation() -> None: + model = StaticEmbedding.from_distillation("sentence-transformers-testing/stsb-bert-tiny-safetensors", pca_dims=32) + assert model.embedding.weight.shape == (29528, 32) + + +@skip_if_no_model2vec() +def test_from_model2vec() -> None: + model = StaticEmbedding.from_model2vec("minishlab/M2V_base_output") + assert model.embedding.weight.shape == (29528, 256) From dc79f13d8debcf2ee220058a662a87f74113c0c9 Mon Sep 17 00:00:00 2001 From: Bo Date: Mon, 21 Oct 2024 05:04:33 -0500 Subject: [PATCH 10/12] Fix cache_dir issue with loading CLIPModel (#3007) * Fix cache_dir issue with loading CLIPModel * Clarify that you must use Transformer-based models in ONNX export --------- Co-authored-by: Tom Aarsen --- sentence_transformers/SentenceTransformer.py | 8 ++++---- sentence_transformers/backend.py | 8 ++++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py index 1a8cb2efb..4fd069f7d 100644 --- a/sentence_transformers/SentenceTransformer.py +++ b/sentence_transformers/SentenceTransformer.py @@ -1718,10 +1718,10 @@ def _load_sbert_model( # Try to initialize the module with a lot of kwargs, but only if the module supports them # Otherwise we fall back to the load method - # try: - module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs) - # except TypeError: - # module = module_class.load(model_name_or_path) + try: + module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs) + except TypeError: + module = module_class.load(model_name_or_path) else: # Normalize does not require any files to be loaded if module_class == Normalize: diff --git a/sentence_transformers/backend.py b/sentence_transformers/backend.py index eef76352e..355f40d83 100644 --- a/sentence_transformers/backend.py +++ b/sentence_transformers/backend.py @@ -78,7 +78,9 @@ def export_optimized_onnx_model( or not isinstance(model[0], Transformer) or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction) ): - raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.') + raise ValueError( + 'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.' + ) ort_model: ORTModelForFeatureExtraction = model[0].auto_model optimizer = ORTOptimizer.from_pretrained(ort_model) @@ -158,7 +160,9 @@ def export_dynamic_quantized_onnx_model( or not isinstance(model[0], Transformer) or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction) ): - raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.') + raise ValueError( + 'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.' + ) ort_model: ORTModelForFeatureExtraction = model[0].auto_model quantizer = ORTQuantizer.from_pretrained(ort_model) From a028b583ca2e3ea19583d33e0a38cf5f34b9c257 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 21 Oct 2024 12:05:35 +0200 Subject: [PATCH 11/12] [`warn`] Throw a warning if compute_metrics is set, as it's not used (#3002) * Throw a warning if compute_metrics is set, as it's not used * Remove "this will become a fatal error" * Remove unneeded comma --- sentence_transformers/trainer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sentence_transformers/trainer.py b/sentence_transformers/trainer.py index 97b0f13df..50115f6b5 100644 --- a/sentence_transformers/trainer.py +++ b/sentence_transformers/trainer.py @@ -2,7 +2,6 @@ import logging import os -import warnings from collections import OrderedDict from contextlib import nullcontext from typing import TYPE_CHECKING, Any, Callable @@ -156,14 +155,19 @@ def __init__( raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument") else: if model_init is not None: - warnings.warn( + logger.warning( "`Trainer` requires either a `model` or `model_init` argument, but not both. `model_init` will" - " overwrite your model when calling the `train` method. This will become a fatal error in the next" - " release.", - FutureWarning, + " overwrite your model when calling the `train` method." ) self.model_init = model_init + if compute_metrics is not None: + logger.warning( + "`compute_metrics` is currently not compatible with the SentenceTransformerTrainer. Please use the " + "`evaluator` argument instead for detailed evaluation metrics, or the `eval_dataset` argument for " + "the evaluation loss." + ) + # Get a dictionary of the default training arguments, so we can determine which arguments have been changed # for the model card default_args_dict = SentenceTransformerTrainingArguments(output_dir="unused").to_dict() From f286d9f210824d6ea1563e789f49894b19c24f0e Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 21 Oct 2024 12:58:02 +0200 Subject: [PATCH 12/12] [`fix`] Prevent IndexError if output_hidden_states & ONNX (#3008) --- sentence_transformers/models/Transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py index 061098c37..fca50225a 100644 --- a/sentence_transformers/models/Transformer.py +++ b/sentence_transformers/models/Transformer.py @@ -352,8 +352,8 @@ def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torc features.update({"token_embeddings": output_tokens, "attention_mask": features["attention_mask"]}) - if self.auto_model.config.output_hidden_states: - all_layer_idx = 2 + if self.auto_model.config.output_hidden_states and len(output_states) > 2: + all_layer_idx = 2 # I.e. after last_hidden_states and pooler_output if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states all_layer_idx = 1