Merge branch 'master' into pr-2964
tomaarsen committed Nov 6, 2024
2 parents d9d485b + 1cb196a commit e3b334c
Showing 57 changed files with 237 additions and 129 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -17,7 +17,7 @@ jobs:
name: Run unit tests
strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
python-version: ['3.9', '3.10', '3.11', '3.12']
os: [ubuntu-latest, windows-latest]
fail-fast: false
runs-on: ${{ matrix.os }}
2 changes: 1 addition & 1 deletion README.md
@@ -24,7 +24,7 @@ For the **full documentation**, see **[www.SBERT.net](https://www.sbert.net)**.

## Installation

We recommend **Python 3.8+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.34.0+](https://github.com/huggingface/transformers)**.
We recommend **Python 3.9+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.34.0+](https://github.com/huggingface/transformers)**.

**Install with pip**

2 changes: 1 addition & 1 deletion docs/installation.md
@@ -1,6 +1,6 @@
# Installation

We recommend **Python 3.8+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.41.0+](https://github.com/huggingface/transformers)**. There are 5 extra options to install Sentence Transformers:
We recommend **Python 3.9+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.41.0+](https://github.com/huggingface/transformers)**. There are 5 extra options to install Sentence Transformers:
* **Default:** This allows for loading, saving, and inference (i.e., getting embeddings) of models.
* **ONNX:** This allows for loading, saving, inference, optimizing, and quantizing of models using the ONNX backend.
* **OpenVINO:** This allows for loading, saving, and inference of models using the OpenVINO backend.
16 changes: 8 additions & 8 deletions examples/training/matryoshka/matryoshka_eval_stsb.py
@@ -5,7 +5,7 @@

import argparse
import os
from typing import Dict, List, Optional, Tuple, cast
from typing import Optional, cast

import matplotlib.pyplot as plt
import numpy as np
@@ -21,7 +21,7 @@

# Dimension plot
def _grouped_barplot_ratios(
group_name_to_x_to_y: Dict[str, Dict[int, float]], ax: Optional[plt.Axes] = None
group_name_to_x_to_y: dict[str, dict[int, float]], ax: Optional[plt.Axes] = None
) -> plt.Axes:
# To save a pandas dependency, do from scratch in matplotlib
if ax is None:
@@ -72,9 +72,9 @@ def _grouped_barplot_ratios(


def plot_across_dimensions(
model_name_to_dim_to_score: Dict[str, Dict[int, float]],
model_name_to_dim_to_score: dict[str, dict[int, float]],
filename: str,
figsize: Tuple[float, float] = (7, 7),
figsize: tuple[float, float] = (7, 7),
title: str = "STSB test score for various embedding dimensions (via truncation),\nwith and without Matryoshka loss",
) -> None:
# Sort each by key
@@ -139,8 +139,8 @@ def plot_across_dimensions(

args = parser.parse_args()
plot_filename: str = args.plot_filename
model_names: List[str] = args.model_names
DIMENSIONS: List[int] = args.dimensions
model_names: list[str] = args.model_names
DIMENSIONS: list[int] = args.dimensions

# Load STSb
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
@@ -153,10 +153,10 @@
)

# Run test_evaluator
model_name_to_dim_to_score: Dict[str, Dict[int, float]] = {}
model_name_to_dim_to_score: dict[str, dict[int, float]] = {}
for model_name in tqdm(model_names, desc="Evaluating models"):
model = SentenceTransformer(model_name)
dim_to_score: Dict[int, float] = {}
dim_to_score: dict[int, float] = {}
for dim in tqdm(DIMENSIONS, desc=f"Evaluating {model_name}"):
output_path = os.path.join(model_name, f"dim-{dim}")
os.makedirs(output_path)
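The typing changes in this file rely on PEP 585 built-in generics (dict[...], list[...], tuple[...]), which are only subscriptable at runtime from Python 3.9 onward. A minimal sketch of the annotation style, using a hypothetical helper that is not taken from the script:

# Sketch only: on Python 3.8, subscripting builtins (e.g. dict[int, float]) raises
# TypeError when the function definition is evaluated, so typing.Dict/List/Tuple
# were required; on 3.9+ the builtins themselves carry the type parameters.

def summarize_scores(dim_to_score: dict[int, float]) -> list[tuple[int, float]]:
    """Return (dimension, score) pairs sorted by dimension. Hypothetical helper."""
    return sorted(dim_to_score.items())


print(summarize_scores({768: 0.86, 64: 0.81}))  # [(64, 0.81), (768, 0.86)]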
8 changes: 5 additions & 3 deletions examples/training/quora_duplicate_questions/create_splits.py
@@ -481,9 +481,11 @@ def write_mining_files(name, ids, dups):


###### Classification dataset #####
with open("quora-IR-dataset/classification/train_pairs.tsv", "w", encoding="utf8") as fOutTrain, open(
"quora-IR-dataset/classification/dev_pairs.tsv", "w", encoding="utf8"
) as fOutDev, open("quora-IR-dataset/classification/test_pairs.tsv", "w", encoding="utf8") as fOutTest:
with (
open("quora-IR-dataset/classification/train_pairs.tsv", "w", encoding="utf8") as fOutTrain,
open("quora-IR-dataset/classification/dev_pairs.tsv", "w", encoding="utf8") as fOutDev,
open("quora-IR-dataset/classification/test_pairs.tsv", "w", encoding="utf8") as fOutTest,
):
fOutTrain.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n")
fOutDev.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n")
fOutTest.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n")
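The rewritten with statement uses parenthesized context managers, a form the Python 3.8 parser rejects, which is presumably why this cleanup accompanies the new 3.9+ requirement. A minimal, self-contained sketch of the pattern with placeholder file names:

# Parenthesized context managers: accepted by the PEG parser shipped since
# Python 3.9 and documented from 3.10; the file names here are placeholders.
with (
    open("train_pairs.tsv", "w", encoding="utf8") as f_train,
    open("dev_pairs.tsv", "w", encoding="utf8") as f_dev,
    open("test_pairs.tsv", "w", encoding="utf8") as f_test,
):
    header = "\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n"
    for handle in (f_train, f_dev, f_test):
        handle.write(header)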
6 changes: 3 additions & 3 deletions examples/unsupervised_learning/CT/train_ct_from_file.py
@@ -55,9 +55,9 @@

################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
with (
gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(filepath, encoding="utf8") as fIn
):
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
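This gzip-or-plain-text with block recurs across several of the example scripts below (MLM, TSDAE, ParallelSentencesDataset). A hypothetical helper, not part of the repository, expressing the same choice without the inline conditional:

import gzip
from typing import IO


def open_text(filepath: str) -> IO[str]:
    """Open *.gz files via gzip in text mode, anything else as plain UTF-8 text."""
    if filepath.endswith(".gz"):
        return gzip.open(filepath, "rt", encoding="utf8")
    return open(filepath, encoding="utf8")


# Usage sketch mirroring the training scripts:
# with open_text(filepath) as fIn:
#     for line in fIn:
#         line = line.strip()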
@@ -55,9 +55,9 @@

################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
with (
gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(filepath, encoding="utf8") as fIn
):
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
16 changes: 10 additions & 6 deletions examples/unsupervised_learning/MLM/train_mlm.py
@@ -48,9 +48,11 @@

train_sentences = []
train_path = sys.argv[2]
with gzip.open(train_path, "rt", encoding="utf8") if train_path.endswith(".gz") else open(
train_path, encoding="utf8"
) as fIn:
with (
gzip.open(train_path, "rt", encoding="utf8")
if train_path.endswith(".gz")
else open(train_path, encoding="utf8") as fIn
):
for line in fIn:
line = line.strip()
if len(line) >= 10:
@@ -61,9 +63,11 @@
dev_sentences = []
if len(sys.argv) >= 4:
dev_path = sys.argv[3]
with gzip.open(dev_path, "rt", encoding="utf8") if dev_path.endswith(".gz") else open(
dev_path, encoding="utf8"
) as fIn:
with (
gzip.open(dev_path, "rt", encoding="utf8")
if dev_path.endswith(".gz")
else open(dev_path, encoding="utf8") as fIn
):
for line in fIn:
line = line.strip()
if len(line) >= 10:
@@ -55,9 +55,9 @@

################# Read the train corpus #################
train_samples = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
with (
gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(filepath, encoding="utf8") as fIn
):
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
6 changes: 3 additions & 3 deletions examples/unsupervised_learning/TSDAE/train_tsdae_from_file.py
@@ -45,9 +45,9 @@

################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
with (
gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(filepath, encoding="utf8") as fIn
):
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
2 changes: 1 addition & 1 deletion index.rst
@@ -28,7 +28,7 @@ Using Sentence Transformer models is elementary:
pip install -U sentence-transformers
We recommend **Python 3.8+** and **PyTorch 1.11.0+**. See `installation <docs/installation.html>`_ for further installation options.
We recommend **Python 3.9+** and **PyTorch 1.11.0+**. See `installation <docs/installation.html>`_ for further installation options.

.. code-block:: python
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -11,7 +11,7 @@ authors = [
maintainers = [
{ name = "Tom Aarsen", email = "[email protected]" }
]
requires-python = ">=3.8"
requires-python = ">=3.9"
keywords = [
"Transformer Networks",
"BERT",
@@ -25,7 +25,6 @@ classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
3 changes: 2 additions & 1 deletion sentence_transformers/SentenceTransformer.py
@@ -13,10 +13,11 @@
import traceback
import warnings
from collections import OrderedDict
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from multiprocessing import Queue
from pathlib import Path
from typing import Any, Callable, Iterable, Iterator, Literal, overload
from typing import Any, Callable, Literal, overload

import numpy as np
import torch
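Importing Iterable and Iterator from collections.abc instead of typing follows the PEP 585 deprecation of the typing aliases; on Python 3.9+ the abc classes are themselves subscriptable. A small illustrative sketch (the function is hypothetical, not library code):

from collections.abc import Iterable, Iterator
from itertools import islice


def first_n(items: Iterable[str], n: int) -> Iterator[str]:
    """Yield at most the first n items; hypothetical example for the import style."""
    return islice(items, n)


print(list(first_n(["a", "b", "c"], 2)))  # ['a', 'b']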
8 changes: 5 additions & 3 deletions sentence_transformers/datasets/ParallelSentencesDataset.py
@@ -77,9 +77,11 @@ def load_data(
logger.info("Load " + filepath)
parallel_sentences = []

with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
with (
gzip.open(filepath, "rt", encoding="utf8")
if filepath.endswith(".gz")
else open(filepath, encoding="utf8") as fIn
):
count = 0
for line in fIn:
sentences = line.strip().split("\t")
@@ -317,8 +317,10 @@ def compute_metrices(

# Encode chunk of corpus
if corpus_embeddings is None:
with nullcontext() if self.truncate_dim is None else corpus_model.truncate_sentence_embeddings(
self.truncate_dim
with (
nullcontext()
if self.truncate_dim is None
else corpus_model.truncate_sentence_embeddings(self.truncate_dim)
):
sub_corpus_embeddings = corpus_model.encode(
self.corpus[corpus_start_idx:corpus_end_idx],
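This hunk, like the two MSE evaluator hunks below, only reflows an existing trick: contextlib.nullcontext() stands in as a no-op context manager when no truncation dimension is set, so a single with block covers both cases. A standalone sketch with a hypothetical context manager in place of truncate_sentence_embeddings:

from contextlib import contextmanager, nullcontext


@contextmanager
def truncate_to(dim: int):
    # Hypothetical stand-in for model.truncate_sentence_embeddings(dim).
    print(f"truncating embeddings to {dim} dimensions")
    try:
        yield
    finally:
        print("restoring full dimensionality")


def encode(truncate_dim=None):
    with nullcontext() if truncate_dim is None else truncate_to(truncate_dim):
        print("encoding sentences")


encode()                 # encoding sentences
encode(truncate_dim=64)  # truncate, encode, restore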
6 changes: 4 additions & 2 deletions sentence_transformers/evaluation/MSEEvaluator.py
@@ -81,8 +81,10 @@ def __init__(
):
super().__init__()
self.truncate_dim = truncate_dim
with nullcontext() if self.truncate_dim is None else teacher_model.truncate_sentence_embeddings(
self.truncate_dim
with (
nullcontext()
if self.truncate_dim is None
else teacher_model.truncate_sentence_embeddings(self.truncate_dim)
):
self.source_embeddings = teacher_model.encode(
source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True
6 changes: 4 additions & 2 deletions sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py
@@ -79,8 +79,10 @@ def __init__(
self.csv_headers.append(f"{src_lang}-{trg_lang}")

all_source_sentences = list(all_source_sentences)
with nullcontext() if self.truncate_dim is None else teacher_model.truncate_sentence_embeddings(
self.truncate_dim
with (
nullcontext()
if self.truncate_dim is None
else teacher_model.truncate_sentence_embeddings(self.truncate_dim)
):
all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size)
self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)}
101 changes: 62 additions & 39 deletions sentence_transformers/evaluation/NanoBEIREvaluator.py
@@ -84,8 +84,7 @@ class NanoBEIREvaluator(SentenceEvaluator):
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import NanoBEIREvaluator
# Load a model
model = SentenceTransformer('all-mpnet-base-v2')
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
datasets = ["QuoraRetrieval", "MSMARCO"]
query_prompts = {
@@ -95,54 +94,78 @@ class NanoBEIREvaluator(SentenceEvaluator):
evaluator = NanoBEIREvaluator(
dataset_names=datasets,
name="NanoBEIR",
query_prompts=query_prompts,
)
results = evaluator(model)
'''
NanoBEIR Evaluation of the model on ['QuoraRetrieval', 'MSMARCO'] dataset:
Evaluating NanoBeIRNanoQuoraRetrieval
Evaluating NanoBeIRNanoMSMARCO
Evaluating NanoQuoraRetrieval
Information Retrieval Evaluation of the model on the NanoQuoraRetrieval dataset:
Queries: 50
Corpus: 5046
Score-Function: cosine
Accuracy@1: 92.00%
Accuracy@3: 98.00%
Accuracy@5: 100.00%
Accuracy@10: 100.00%
Precision@1: 92.00%
Precision@3: 40.67%
Precision@5: 26.00%
Precision@10: 14.00%
Recall@1: 81.73%
Recall@3: 94.20%
Recall@5: 97.93%
Recall@10: 100.00%
MRR@10: 0.9540
NDCG@10: 0.9597
MAP@100: 0.9395
Evaluating NanoMSMARCO
Information Retrieval Evaluation of the model on the NanoMSMARCO dataset:
Queries: 50
Corpus: 5043
Score-Function: cosine
Accuracy@1: 40.00%
Accuracy@3: 74.00%
Accuracy@5: 78.00%
Accuracy@10: 88.00%
Precision@1: 40.00%
Precision@3: 24.67%
Precision@5: 15.60%
Precision@10: 8.80%
Recall@1: 40.00%
Recall@3: 74.00%
Recall@5: 78.00%
Recall@10: 88.00%
MRR@10: 0.5849
NDCG@10: 0.6572
MAP@100: 0.5892
Average Queries: 50.0
Average Corpus: 5044.5
Aggregated for Score Function: cosine
Accuracy@1: 39.00%
Accuracy@3: 57.00%
Accuracy@5: 66.00%
Accuracy@10: 77.00%
Precision@1: 39.00%
Recall@1: 34.03%
Precision@3: 20.67%
Recall@3: 54.07%
Precision@5: 15.00%
Recall@5: 64.27%
Precision@10: 8.90%
Recall@10: 75.97%
MRR@10: 0.5004
NDCG@10: 0.5513
Aggregated for Score Function: dot
Accuracy@1: 39.00%
Accuracy@3: 57.00%
Accuracy@5: 66.00%
Accuracy@10: 77.00%
Precision@1: 39.00%
Recall@1: 34.03%
Precision@3: 20.67%
Recall@3: 54.07%
Precision@5: 15.00%
Recall@5: 64.27%
Precision@10: 8.90%
Recall@10: 75.97%
MRR@10: 0.5004
NDCG@10: 0.5513
Accuracy@1: 66.00%
Accuracy@3: 86.00%
Accuracy@5: 89.00%
Accuracy@10: 94.00%
Precision@1: 66.00%
Recall@1: 60.87%
Precision@3: 32.67%
Recall@3: 84.10%
Precision@5: 20.80%
Recall@5: 87.97%
Precision@10: 11.40%
Recall@10: 94.00%
MRR@10: 0.7694
NDCG@10: 0.8085
'''
logger.info(evaluator.primary_metric)
# => "cosine_ndcg@10"
logger.info(results["mean"][evaluator.primary_metric])
# => 0.5512516989358924
print(evaluator.primary_metric)
# => "NanoBEIR_mean_cosine_ndcg@10"
print(results[evaluator.primary_metric])
# => 0.8084508771660436
"""

def __init__(