Commit dd76c03

Merge branch 'master' into v3.2-release; version to 3.2.1
tomaarsen committed Oct 21, 2024
2 parents 539bf92 + f286d9f commit dd76c03
Showing 20 changed files with 219 additions and 71 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/tests.yml
@@ -63,6 +63,10 @@ jobs:
    python -m pip install --upgrade pip
    python -m pip install '.[train, onnx, openvino, dev]'
+- name: Install model2vec
+  run: python -m pip install model2vec
+  if: ${{ contains(fromJSON('["3.10", "3.11", "3.12"]'), matrix.python-version) }}

- name: Run unit tests
  run: |
    python -m pytest --durations 20 -sv tests/
@@ -6,7 +6,7 @@
from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch

# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
dataset = load_dataset("quora", split="train", trust_remote_code=True).map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
@@ -26,7 +26,7 @@
# 4. Choose a target precision for the corpus embeddings
corpus_precision = "binary"
# Valid options are: "float32", "uint8", "int8", "ubinary", and "binary"
-# But usearch only supports "float32", "int8", and "binary"
+# But usearch only supports "float32", "int8", "binary" and "ubinary"

# 5. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
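As a usage note for this demo (an editor's sketch, not part of the diff): the remaining steps quantize the corpus and search it, assuming the quantize_embeddings/semantic_search_usearch signatures imported at the top of this file; the query text and top_k value are illustrative.

    # Hedged sketch: quantize the corpus embeddings, then search with usearch.
    corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
    query_embeddings = model.encode(["How do I become a good programmer?"], normalize_embeddings=True)
    results, search_time = semantic_search_usearch(
        query_embeddings,
        corpus_embeddings=corpus_embeddings,
        corpus_precision=corpus_precision,
        top_k=10,
    )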
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "sentence-transformers"
version = "3.2.0"
version = "3.2.1"
description = "State-of-the-Art Text Embeddings"
license = { text = "Apache 2.0" }
readme = "README.md"
@@ -49,8 +49,8 @@ Repository = "https://github.com/UKPLab/sentence-transformers/"

[project.optional-dependencies]
train = ["datasets", "accelerate>=0.20.3"]
onnx = ["optimum[onnxruntime]>=1.23.0"]
onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.0"]
onnx = ["optimum[onnxruntime]>=1.23.1"]
onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.1"]
openvino = ["optimum-intel[openvino]>=1.20.0"]
dev = ["datasets", "accelerate>=0.20.3", "pre-commit", "pytest", "pytest-cov"]

@@ -100,4 +100,4 @@ testpaths = [
addopts = "--strict-markers -m 'not slow'"
markers = [
"slow: marks tests as slow"
-]
\ No newline at end of file
+]
8 changes: 4 additions & 4 deletions sentence_transformers/SentenceTransformer.py
@@ -1718,10 +1718,10 @@ def _load_sbert_model(

# Try to initialize the module with a lot of kwargs, but only if the module supports them
# Otherwise we fall back to the load method
-# try:
-module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
-# except TypeError:
-#     module = module_class.load(model_name_or_path)
+try:
+    module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
+except TypeError:
+    module = module_class.load(model_name_or_path)
else:
# Normalize does not require any files to be loaded
if module_class == Normalize:
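Context for this change (an editor's sketch, not code from the repository): the restored try/except matters for custom modules whose constructors do not accept the newer keyword arguments. The LegacyModule class below is invented for illustration.

    class LegacyModule:
        # No cache_dir/backend/**kwargs in the signature, so the first call in
        # _load_sbert_model raises TypeError and the loader falls back to .load().
        def __init__(self, model_name_or_path: str):
            self.path = model_name_or_path

        @classmethod
        def load(cls, input_path: str) -> "LegacyModule":
            # Read any config from input_path and construct the module.
            return cls(input_path)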
2 changes: 1 addition & 1 deletion sentence_transformers/__init__.py
@@ -1,6 +1,6 @@
from __future__ import annotations

__version__ = "3.2.0"
__version__ = "3.2.1"
__MODEL_HUB_ORGANIZATION__ = "sentence-transformers"

import importlib
8 changes: 6 additions & 2 deletions sentence_transformers/backend.py
@@ -78,7 +78,9 @@ def export_optimized_onnx_model(
or not isinstance(model[0], Transformer)
or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction)
):
-raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.')
+raise ValueError(
+    'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.'
+)

ort_model: ORTModelForFeatureExtraction = model[0].auto_model
optimizer = ORTOptimizer.from_pretrained(ort_model)
@@ -158,7 +160,9 @@ def export_dynamic_quantized_onnx_model(
or not isinstance(model[0], Transformer)
or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction)
):
-raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.')
+raise ValueError(
+    'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.'
+)

ort_model: ORTModelForFeatureExtraction = model[0].auto_model
quantizer = ORTQuantizer.from_pretrained(ort_model)
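A hedged usage sketch for these export helpers; the model id, optimization level, and output directory below are illustrative, not taken from this commit.

    from sentence_transformers import SentenceTransformer
    from sentence_transformers.backend import export_optimized_onnx_model

    # Load with the ONNX backend, then write an optimized copy of the model.
    model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx")
    export_optimized_onnx_model(model, "O3", "local-output-dir")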
7 changes: 6 additions & 1 deletion sentence_transformers/losses/CachedGISTEmbedLoss.py
@@ -10,7 +10,7 @@
from torch.utils.checkpoint import get_device_states, set_device_states

from sentence_transformers import SentenceTransformer
-from sentence_transformers.models import Transformer
+from sentence_transformers.models import StaticEmbedding, Transformer


class RandContext:
@@ -139,6 +139,11 @@ def __init__(
trainer.train()
"""
super().__init__()
+if isinstance(model[0], StaticEmbedding):
+    raise ValueError(
+        "CachedGISTEmbedLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+        "Consider using GISTEmbedLoss instead."
+    )
self.model = model
self.guide = guide
self.temperature = temperature
@@ -10,6 +10,7 @@
from torch.utils.checkpoint import get_device_states, set_device_states

from sentence_transformers import SentenceTransformer, util
+from sentence_transformers.models import StaticEmbedding


class RandContext:
@@ -145,6 +146,12 @@ def __init__(
trainer.train()
"""
super().__init__()
+if isinstance(model[0], StaticEmbedding):
+    raise ValueError(
+        "CachedMultipleNegativesRankingLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+        "Consider using MultipleNegativesRankingLoss instead."
+    )
+
self.model = model
self.scale = scale
self.similarity_fct = similarity_fct
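A sketch of the new guard from the caller's side; the model2vec checkpoint id is illustrative and not taken from this commit.

    from sentence_transformers import SentenceTransformer, losses
    from sentence_transformers.models import StaticEmbedding

    # "minishlab/M2V_base_output" is an illustrative model2vec id.
    static = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
    model = SentenceTransformer(modules=[static])

    loss = losses.MultipleNegativesRankingLoss(model)   # supported alternative
    # losses.CachedMultipleNegativesRankingLoss(model)  # raises ValueError after this change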
@@ -10,6 +10,7 @@

from sentence_transformers import SentenceTransformer, util
from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import RandContext
+from sentence_transformers.models import StaticEmbedding


def _backward_hook(
@@ -114,6 +115,12 @@ def __init__(
- Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup: https://arxiv.org/pdf/2101.06983.pdf
"""
super().__init__()
+if isinstance(model[0], StaticEmbedding):
+    raise ValueError(
+        "CachedMultipleNegativesSymmetricRankingLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+        "Consider using MultipleNegativesSymmetricRankingLoss instead."
+    )
+
self.model = model
self.scale = scale
self.similarity_fct = similarity_fct
7 changes: 7 additions & 0 deletions sentence_transformers/losses/DenoisingAutoEncoderLoss.py
@@ -7,6 +7,7 @@
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel

from sentence_transformers import SentenceTransformer
+from sentence_transformers.models import StaticEmbedding

logger = logging.getLogger(__name__)

@@ -73,6 +74,12 @@ def __init__(
)
"""
super().__init__()

+if isinstance(model[0], StaticEmbedding):
+    raise ValueError(
+        "DenoisingAutoEncoderLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding."
+    )

self.encoder = model # This will be the final model used during the inference time.
self.tokenizer_encoder = model.tokenizer

8 changes: 7 additions & 1 deletion sentence_transformers/losses/GISTEmbedLoss.py
@@ -5,7 +5,7 @@
import torch
from torch import Tensor, nn

-from sentence_transformers.models import Transformer
+from sentence_transformers.models import StaticEmbedding, Transformer
from sentence_transformers.SentenceTransformer import SentenceTransformer


@@ -91,6 +91,12 @@ def __init__(
if self.must_retokenize:
self.tokenizer = self.model.tokenizer

+    if isinstance(self.model[0], StaticEmbedding):
+        raise ValueError(
+            "If we must retokenize because the guide model has a different tokenizer, "
+            "then the Sentence Transformer model must not be based on a StaticEmbedding."
+        )

def sim_matrix(self, embed1: Tensor, embed2: Tensor) -> Tensor:
return self.similarity_fct(embed1.unsqueeze(1), embed2.unsqueeze(0))

26 changes: 14 additions & 12 deletions sentence_transformers/losses/Matryoshka2dLoss.py
@@ -95,21 +95,23 @@ def __init__(
Example:
::
-from sentence_transformers import SentenceTransformer, losses, InputExample
-from torch.utils.data import DataLoader
+from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+from datasets import Dataset

model = SentenceTransformer("microsoft/mpnet-base")

-train_examples = [
-    InputExample(texts=['Anchor 1', 'Positive 1']),
-    InputExample(texts=['Anchor 2', 'Positive 2']),
-]
-train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
-train_loss = losses.MultipleNegativesRankingLoss(model=model)
-train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])
-model.fit(
-    [(train_dataloader, train_loss)],
-    epochs=10,
+train_dataset = Dataset.from_dict({
+    "anchor": ["It's nice weather outside today.", "He drove to work."],
+    "positive": ["It's so sunny.", "He took the car to the office."],
+})
+loss = losses.MultipleNegativesRankingLoss(model)
+loss = losses.Matryoshka2dLoss(model, loss, [768, 512, 256, 128, 64])
+trainer = SentenceTransformerTrainer(
+    model=model,
+    train_dataset=train_dataset,
+    loss=loss,
)
+trainer.train()
"""
matryoshka_loss = MatryoshkaLoss(
model,
26 changes: 14 additions & 12 deletions sentence_transformers/losses/MatryoshkaLoss.py
@@ -101,21 +101,23 @@ def __init__(
Example:
::
-from sentence_transformers import SentenceTransformer, losses, InputExample
-from torch.utils.data import DataLoader
+from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+from datasets import Dataset

model = SentenceTransformer("microsoft/mpnet-base")

-train_examples = [
-    InputExample(texts=['Anchor 1', 'Positive 1']),
-    InputExample(texts=['Anchor 2', 'Positive 2']),
-]
-train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
-train_loss = losses.MultipleNegativesRankingLoss(model=model)
-train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
-model.fit(
-    [(train_dataloader, train_loss)],
-    epochs=10,
+train_dataset = Dataset.from_dict({
+    "anchor": ["It's nice weather outside today.", "He drove to work."],
+    "positive": ["It's so sunny.", "He took the car to the office."],
+})
+loss = losses.MultipleNegativesRankingLoss(model)
+loss = losses.MatryoshkaLoss(model, loss, [768, 512, 256, 128, 64])
+trainer = SentenceTransformerTrainer(
+    model=model,
+    train_dataset=train_dataset,
+    loss=loss,
)
+trainer.train()
"""
super().__init__()
self.model = model
33 changes: 19 additions & 14 deletions sentence_transformers/losses/MegaBatchMarginLoss.py
@@ -59,25 +59,30 @@ def __init__(
Example:
::
-from sentence_transformers import SentenceTransformer, InputExample, losses
-from torch.utils.data import DataLoader
+from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer, losses
+from datasets import Dataset

-model = SentenceTransformer('all-MiniLM-L6-v2')
-total_examples = 500
train_batch_size = 250
train_mini_batch_size = 32

-train_examples = [
-    InputExample(texts=[f"This is sentence number {i}", f"This is sentence number {i+1}"]) for i in range(total_examples)
-]
-train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
-train_loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
-model.fit(
-    [(train_dataloader, train_loss)],
-    epochs=10,
-)
+model = SentenceTransformer('all-MiniLM-L6-v2')
+train_dataset = Dataset.from_dict({
+    "anchor": [f"This is sentence number {i}" for i in range(500)],
+    "positive": [f"This is sentence number {i}" for i in range(1, 501)],
+})
+loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
+args = SentenceTransformerTrainingArguments(
+    output_dir="output",
+    per_device_train_batch_size=train_batch_size,
+)
+trainer = SentenceTransformerTrainer(
+    model=model,
+    args=args,
+    train_dataset=train_dataset,
+    loss=loss,
+)
+trainer.train()
"""
super().__init__()
self.model = model
7 changes: 6 additions & 1 deletion sentence_transformers/model_card.py
@@ -423,10 +423,15 @@ def set_widget_examples(self, dataset: Dataset | DatasetDict) -> None:
columns = [
column
for column, feature in dataset[dataset_name].features.items()
if isinstance(feature, Value) and feature.dtype == "string" and column != "dataset_name"
if isinstance(feature, Value)
and (feature.dtype == "string" or feature.dtype == "large_string")
and column != "dataset_name"
]
str_dataset = dataset[dataset_name].select_columns(columns)
dataset_size = len(str_dataset)
+if dataset_size == 0:
+    continue

lengths = {}
for idx, sample in enumerate(
str_dataset.select(random.sample(range(dataset_size), k=min(num_samples_to_check, dataset_size)))
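A hedged illustration of the widened dtype filter; the dataset and column names below are ad hoc, not from the repository.

    from datasets import Dataset, Features, Value

    features = Features({"text": Value("large_string"), "label": Value("int64")})
    ds = Dataset.from_dict({"text": ["a", "b"], "label": [0, 1]}, features=features)
    string_columns = [
        column
        for column, feature in ds.features.items()
        if isinstance(feature, Value)
        and feature.dtype in ("string", "large_string")
        and column != "dataset_name"
    ]
    # string_columns == ["text"]; before this fix, large_string columns were skipped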
16 changes: 12 additions & 4 deletions sentence_transformers/models/StaticEmbedding.py
@@ -159,9 +159,11 @@ def from_distillation(
"""

try:
-    from model2vec import distill
+    from model2vec.distill import distill
except ImportError:
-    raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")
+    raise ImportError(
+        "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
+    )

device = get_device_name()
static_model = distill(
@@ -172,7 +174,10 @@
apply_zipf=apply_zipf,
use_subword=use_subword,
)
-embedding_weights = static_model.embedding.weight
+if isinstance(static_model.embedding, np.ndarray):
+    embedding_weights = torch.from_numpy(static_model.embedding)
+else:
+    embedding_weights = static_model.embedding.weight
tokenizer: Tokenizer = static_model.tokenizer

return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name)
@@ -200,7 +205,10 @@ def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")

static_model = StaticModel.from_pretrained(model_id_or_path)
-embedding_weights = static_model.embedding.weight
+if isinstance(static_model.embedding, np.ndarray):
+    embedding_weights = torch.from_numpy(static_model.embedding)
+else:
+    embedding_weights = static_model.embedding.weight
tokenizer: Tokenizer = static_model.tokenizer

return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path)
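Taken together, the two fixes above keep both construction paths working with newer model2vec releases. A usage sketch; the base model id is illustrative, and the distillation path assumes `pip install model2vec[distill]`.

    from sentence_transformers import SentenceTransformer
    from sentence_transformers.models import StaticEmbedding

    # Distill a static embedding model from an illustrative base model.
    static = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5")
    model = SentenceTransformer(modules=[static])
    embeddings = model.encode(["Static embeddings skip the transformer forward pass."])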
6 changes: 3 additions & 3 deletions sentence_transformers/models/Transformer.py
@@ -155,7 +155,7 @@ def _load_openvino_model(self, model_name_or_path, config, cache_dir, **model_args):
else:
model_args["ov_config"] = {}

-# Either load an exported model, or export the model to ONNX
+# Either load an exported model, or export the model to OpenVINO
self.auto_model: OVModelForFeatureExtraction = OVModelForFeatureExtraction.from_pretrained(
model_name_or_path,
config=config,
@@ -352,8 +352,8 @@ def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:

features.update({"token_embeddings": output_tokens, "attention_mask": features["attention_mask"]})

-if self.auto_model.config.output_hidden_states:
-    all_layer_idx = 2
+if self.auto_model.config.output_hidden_states and len(output_states) > 2:
+    all_layer_idx = 2  # I.e. after last_hidden_states and pooler_output
    if len(output_states) < 3:  # Some models only output last_hidden_states and all_hidden_states
        all_layer_idx = 1

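A hedged sketch of the code path this fix guards: enabling hidden states and reading them back as "all_layer_embeddings". The model id is illustrative, and the exact output keys follow this module's forward pass.

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    model[0].auto_model.config.output_hidden_states = True
    features = model.tokenize(["Hello world"])
    output = model(features)
    # With the fix, index 2 is only used when a pooler_output is actually
    # present; models without one fall back to index 1.
    print(len(output["all_layer_embeddings"]))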
(The remaining changed files were not loaded in this view.)