From 6d119849525504e57077a967600ddf11586392b3 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 18 Dec 2023 15:45:12 +0100 Subject: [PATCH] Run code formatting on docs (#2389) blacken-docs in particular --- docs/hugging_face.md | 16 ++-- docs/package_reference/SentenceTransformer.md | 3 +- docs/pretrained-models/ce-msmarco.md | 13 ++- docs/pretrained-models/dpr.md | 9 +- docs/pretrained-models/msmarco-v1.md | 7 +- docs/pretrained-models/msmarco-v2.md | 7 +- docs/pretrained-models/msmarco-v3.md | 7 +- docs/pretrained-models/msmarco-v5.md | 11 ++- docs/pretrained-models/nq-v1.md | 11 ++- docs/pretrained_cross-encoders.md | 30 +++--- docs/pretrained_models.md | 25 +++-- docs/quickstart.md | 62 +++++++------ docs/training/overview.md | 93 ++++++++++++------- docs/usage/semantic_textual_similarity.md | 77 ++++++++------- .../computing-embeddings/README.md | 80 +++++++++------- examples/applications/cross-encoder/README.md | 6 +- examples/applications/image-search/README.md | 16 ++-- .../applications/paraphrase-mining/README.md | 20 ++-- .../applications/retrieve_rerank/README.md | 8 +- .../applications/semantic-search/README.md | 4 +- examples/training/cross-encoder/README.md | 22 +++-- examples/training/multilingual/README.md | 20 ++-- .../quora_duplicate_questions/README.md | 87 +++++++++++------ examples/training/sts/README.md | 17 +++- .../unsupervised_learning/SimCSE/README.md | 18 ++-- .../unsupervised_learning/TSDAE/README.md | 26 +++--- .../query_generation/README.md | 11 ++- 27 files changed, 430 insertions(+), 276 deletions(-) diff --git a/docs/hugging_face.md b/docs/hugging_face.md index 04b681eb9..84febcf59 100644 --- a/docs/hugging_face.md +++ b/docs/hugging_face.md @@ -18,7 +18,8 @@ Any pre-trained models from the Hub can be loaded with a single line of code: ```py from sentence_transformers import SentenceTransformer -model = SentenceTransformer('model_name') + +model = SentenceTransformer("model_name") ``` 
You can even click `Use in sentence-transformers` to get a code snippet that you can copy and paste! @@ -32,11 +33,14 @@ Here is an example that loads the [multi-qa-MiniLM-L6-cos-v1 model](https://hugg ```py from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') -query_embedding = model.encode('How big is London') -passage_embedding = model.encode(['London has 9,787,426 inhabitants at the 2011 census', - 'London is known for its finacial district']) +model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1") + +query_embedding = model.encode("How big is London") +passage_embedding = model.encode([ + "London has 9,787,426 inhabitants at the 2011 census", + "London is known for its finacial district", +]) print("Similarity:", util.dot_score(query_embedding, passage_embedding)) ``` @@ -51,7 +55,7 @@ answer_1 = "All plans come with unlimited private models and datasets." answer_2 = "AutoNLP is an automatic way to train and deploy state-of-the-art NLP models, seamlessly integrated with the Hugging Face ecosystem." answer_3 = "Based on how much training data and model variants are created, we send you a compute cost and payment link - as low as $10 per job." 
-model = SentenceTransformer('clips/mfaq') +model = SentenceTransformer("clips/mfaq") query_embedding = model.encode(question) corpus_embeddings = model.encode([answer_1, answer_2, answer_3]) diff --git a/docs/package_reference/SentenceTransformer.md b/docs/package_reference/SentenceTransformer.md index 857515606..283a7eb05 100644 --- a/docs/package_reference/SentenceTransformer.md +++ b/docs/package_reference/SentenceTransformer.md @@ -3,7 +3,8 @@ This page documents the properties and methods when you load a SentenceTransformer model: ```python from sentence_transformers import SentenceTransformer -model = SentenceTransformer('model-name') + +model = SentenceTransformer("model-name") ``` ```eval_rst diff --git a/docs/pretrained-models/ce-msmarco.md b/docs/pretrained-models/ce-msmarco.md index dfe3bfbea..547d8ee37 100644 --- a/docs/pretrained-models/ce-msmarco.md +++ b/docs/pretrained-models/ce-msmarco.md @@ -8,8 +8,11 @@ The training data consists of over 500k examples, while the complete corpus cons Pre-trained models can be used like this: ```python from sentence_transformers import CrossEncoder -model = CrossEncoder('model_name', max_length=512) -scores = model.predict([('Query', 'Paragraph1'), ('Query', 'Paragraph2') , ('Query', 'Paragraph3')]) + +model = CrossEncoder("model_name", max_length=512) +scores = model.predict( + [("Query", "Paragraph1"), ("Query", "Paragraph2"), ("Query", "Paragraph3")] +) ``` ## Usage with Transformers @@ -18,10 +21,10 @@ scores = model.predict([('Query', 'Paragraph1'), ('Query', 'Paragraph2') , ('Que from transformers import AutoTokenizer, AutoModelForSequenceClassification import torch -model = AutoModelForSequenceClassification.from_pretrained('model_name') -tokenizer = AutoTokenizer.from_pretrained('model_name') +model = AutoModelForSequenceClassification.from_pretrained("model_name") +tokenizer = AutoTokenizer.from_pretrained("model_name") -features = tokenizer(['Query', 'Query'], ['Paragraph1', 'Paragraph2'], padding=True, 
truncation=True, return_tensors="pt") +features = tokenizer(["Query", "Query"], ["Paragraph1", "Paragraph2"], padding=True, truncation=True, return_tensors="pt") model.eval() with torch.no_grad(): diff --git a/docs/pretrained-models/dpr.md b/docs/pretrained-models/dpr.md index eb3c9007d..44fabef7d 100644 --- a/docs/pretrained-models/dpr.md +++ b/docs/pretrained-models/dpr.md @@ -17,21 +17,22 @@ To encode paragraphs, you need to provide a title (e.g. the Wikipedia article ti Queries are encoded with **question_encoder**: ```python from sentence_transformers import SentenceTransformer, util -passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base') + +passage_encoder = SentenceTransformer("facebook-dpr-ctx_encoder-single-nq-base") passages = [ "London [SEP] London is the capital and largest city of England and the United Kingdom.", "Paris [SEP] Paris is the capital and most populous city of France.", - "Berlin [SEP] Berlin is the capital and largest city of Germany by both area and population." + "Berlin [SEP] Berlin is the capital and largest city of Germany by both area and population.", ] passage_embeddings = passage_encoder.encode(passages) -query_encoder = SentenceTransformer('facebook-dpr-question_encoder-single-nq-base') +query_encoder = SentenceTransformer("facebook-dpr-question_encoder-single-nq-base") query = "What is the capital of England?" 
query_embedding = query_encoder.encode(query) -#Important: You must use dot-product, not cosine_similarity +# Important: You must use dot-product, not cosine_similarity scores = util.dot_score(query_embedding, passage_embeddings) print("Scores:", scores) ``` diff --git a/docs/pretrained-models/msmarco-v1.md b/docs/pretrained-models/msmarco-v1.md index 55569e5da..be537f2d7 100644 --- a/docs/pretrained-models/msmarco-v1.md +++ b/docs/pretrained-models/msmarco-v1.md @@ -14,10 +14,11 @@ Version 1 models were trained on the training set of MS Marco Passage retrieval They can be used like this: ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('distilroberta-base-msmarco-v1') -query_embedding = model.encode('[QRY] ' + 'How big is London') -passage_embedding = model.encode('[DOC] ' + 'London has 9,787,426 inhabitants at the 2011 census') +model = SentenceTransformer("distilroberta-base-msmarco-v1") + +query_embedding = model.encode("[QRY] " + "How big is London") +passage_embedding = model.encode("[DOC] " + "London has 9,787,426 inhabitants at the 2011 census") print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding)) ``` diff --git a/docs/pretrained-models/msmarco-v2.md b/docs/pretrained-models/msmarco-v2.md index 76e53b2a8..c9a88e4df 100644 --- a/docs/pretrained-models/msmarco-v2.md +++ b/docs/pretrained-models/msmarco-v2.md @@ -6,10 +6,11 @@ The training data consists of over 500k examples, while the complete corpus con ## Usage ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('msmarco-distilroberta-base-v2') -query_embedding = model.encode('How big is London') -passage_embedding = model.encode('London has 9,787,426 inhabitants at the 2011 census') +model = SentenceTransformer("msmarco-distilroberta-base-v2") + +query_embedding = model.encode("How big is London") +passage_embedding = model.encode("London has 9,787,426 inhabitants at the 2011 census") 
print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding)) ``` diff --git a/docs/pretrained-models/msmarco-v3.md b/docs/pretrained-models/msmarco-v3.md index 1a7ffb5a3..4f6cf5a61 100644 --- a/docs/pretrained-models/msmarco-v3.md +++ b/docs/pretrained-models/msmarco-v3.md @@ -6,10 +6,11 @@ The training data constist of over 500k examples, while the complete corpus con ## Usage ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('msmarco-distilroberta-base-v3') -query_embedding = model.encode('How big is London') -passage_embedding = model.encode('London has 9,787,426 inhabitants at the 2011 census') +model = SentenceTransformer("msmarco-distilroberta-base-v3") + +query_embedding = model.encode("How big is London") +passage_embedding = model.encode("London has 9,787,426 inhabitants at the 2011 census") print("Similarity:", util.cos_sim(query_embedding, passage_embedding)) ``` diff --git a/docs/pretrained-models/msmarco-v5.md b/docs/pretrained-models/msmarco-v5.md index 2e2c62a9b..d3f29ca71 100644 --- a/docs/pretrained-models/msmarco-v5.md +++ b/docs/pretrained-models/msmarco-v5.md @@ -6,11 +6,14 @@ The training data constist of over 500k examples, while the complete corpus con ## Usage ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('msmarco-distilbert-dot-v5') -query_embedding = model.encode('How big is London') -passage_embedding = model.encode(['London has 9,787,426 inhabitants at the 2011 census', - 'London is known for its finacial district']) +model = SentenceTransformer("msmarco-distilbert-dot-v5") + +query_embedding = model.encode("How big is London") +passage_embedding = model.encode([ + "London has 9,787,426 inhabitants at the 2011 census", + "London is known for its finacial district", +]) print("Similarity:", util.dot_score(query_embedding, passage_embedding)) ``` diff --git a/docs/pretrained-models/nq-v1.md 
b/docs/pretrained-models/nq-v1.md index 94a3b3bd1..c7b8201ee 100644 --- a/docs/pretrained-models/nq-v1.md +++ b/docs/pretrained-models/nq-v1.md @@ -5,12 +5,15 @@ ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('nq-distilbert-base-v1') -query_embedding = model.encode('How many people live in London?') +model = SentenceTransformer("nq-distilbert-base-v1") -#The passages are encoded as [ [title1, text1], [title2, text2], ...] -passage_embedding = model.encode([['London', 'London has 9,787,426 inhabitants at the 2011 census.']]) +query_embedding = model.encode("How many people live in London?") + +# The passages are encoded as [ [title1, text1], [title2, text2], ...] +passage_embedding = model.encode( + [["London", "London has 9,787,426 inhabitants at the 2011 census."]] +) print("Similarity:", util.cos_sim(query_embedding, passage_embedding)) ``` diff --git a/docs/pretrained_cross-encoders.md b/docs/pretrained_cross-encoders.md index 62e8c1655..e95097e8d 100644 --- a/docs/pretrained_cross-encoders.md +++ b/docs/pretrained_cross-encoders.md @@ -10,12 +10,15 @@ This page lists available **pretrained Cross-Encoders**. 
Cross-Encoders require These models can be used like this: ```python from sentence_transformers import CrossEncoder -model = CrossEncoder('model_name', max_length=512) -scores = model.predict([('Query1', 'Paragraph1'), ('Query1', 'Paragraph2')]) -#For Example -scores = model.predict([('How many people live in Berlin?', 'Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.'), - ('How many people live in Berlin?', 'Berlin is well known for its museums.')]) +model = CrossEncoder("model_name", max_length=512) +scores = model.predict([("Query1", "Paragraph1"), ("Query1", "Paragraph2")]) + +# For Example +scores = model.predict([ + ("How many people live in Berlin?", "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers."), + ("How many people live in Berlin?", "Berlin is well known for its museums."), +]) ``` - **cross-encoder/ms-marco-TinyBERT-L-2-v2** - MRR@10 on MS Marco Dev Set: 32.56 @@ -42,8 +45,9 @@ QNLI is based on the [SQuAD dataset](https://rajpurkar.github.io/SQuAD-explorer/ The following models can be used like this: ```python from sentence_transformers import CrossEncoder -model = CrossEncoder('model_name') -scores = model.predict([('Sent A1', 'Sent B1'), ('Sent A2', 'Sent B2')]) + +model = CrossEncoder("model_name") +scores = model.predict([("Sent A1", "Sent B1"), ("Sent A2", "Sent B2")]) ``` They return a score 0...1 indicating the semantic similarity of the given sentence pair. 
@@ -75,11 +79,15 @@ Given two sentences, are these contradicting each other, entailing one the other ```python from sentence_transformers import CrossEncoder -model = CrossEncoder('model_name') -scores = model.predict([('A man is eating pizza', 'A man eats something'), ('A black race car starts up in front of a crowd of people.', 'A man is driving down a lonely road.')]) -#Convert scores to labels -label_mapping = ['contradiction', 'entailment', 'neutral'] +model = CrossEncoder("model_name") +scores = model.predict([ + ("A man is eating pizza", "A man eats something"), + ("A black race car starts up in front of a crowd of people.", "A man is driving down a lonely road."), +]) + +# Convert scores to labels +label_mapping = ["contradiction", "entailment", "neutral"] labels = [label_mapping[score_max] for score_max in scores.argmax(axis=1)] ``` diff --git a/docs/pretrained_models.md b/docs/pretrained_models.md index 37e4e3a6e..f87903161 100644 --- a/docs/pretrained_models.md +++ b/docs/pretrained_models.md @@ -4,7 +4,8 @@ We provide various pre-trained models. Using these models is easy: ```python from sentence_transformers import SentenceTransformer -model = SentenceTransformer('model_name') + +model = SentenceTransformer("model_name") ``` All models are hosted on the [HuggingFace Model Hub](https://huggingface.co/sentence-transformers). 
@@ -26,11 +27,14 @@ The following models have been specifically trained for **Semantic Search**: Giv ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') -query_embedding = model.encode('How big is London') -passage_embedding = model.encode(['London has 9,787,426 inhabitants at the 2011 census', - 'London is known for its finacial district']) +model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1") + +query_embedding = model.encode("How big is London") +passage_embedding = model.encode([ + "London has 9,787,426 inhabitants at the 2011 census", + "London is known for its finacial district", +]) print("Similarity:", util.dot_score(query_embedding, passage_embedding)) ``` @@ -144,12 +148,15 @@ The following models were trained on [Google's Natural Questions dataset](https: ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('nq-distilbert-base-v1') -query_embedding = model.encode('How many people live in London?') +model = SentenceTransformer("nq-distilbert-base-v1") + +query_embedding = model.encode("How many people live in London?") -#The passages are encoded as [ [title1, text1], [title2, text2], ...] -passage_embedding = model.encode([['London', 'London has 9,787,426 inhabitants at the 2011 census.']]) +# The passages are encoded as [ [title1, text1], [title2, text2], ...] 
+passage_embedding = model.encode( + [["London", "London has 9,787,426 inhabitants at the 2011 census."]] +) print("Similarity:", util.cos_sim(query_embedding, passage_embedding)) ``` diff --git a/docs/quickstart.md b/docs/quickstart.md index 7fda17822..1fcb776de 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -2,17 +2,20 @@ Once you have SentenceTransformers [installed](installation.md), the usage is simple: ```python from sentence_transformers import SentenceTransformer -model = SentenceTransformer('all-MiniLM-L6-v2') -#Our sentences we like to encode -sentences = ['This framework generates embeddings for each input sentence', - 'Sentences are passed as a list of string.', - 'The quick brown fox jumps over the lazy dog.'] +model = SentenceTransformer("all-MiniLM-L6-v2") -#Sentences are encoded by calling model.encode() +# Our sentences we like to encode +sentences = [ + "This framework generates embeddings for each input sentence", + "Sentences are passed as a list of string.", + "The quick brown fox jumps over the lazy dog.", +] + +# Sentences are encoded by calling model.encode() sentence_embeddings = model.encode(sentences) -#Print the embeddings +# Print the embeddings for sentence, embedding in zip(sentences, sentence_embeddings): print("Sentence:", sentence) print("Embedding:", embedding) @@ -30,9 +33,10 @@ The sentences (texts) are mapped such that sentences with similar meanings are c ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('all-MiniLM-L6-v2') -#Sentences are encoded by calling model.encode() +model = SentenceTransformer("all-MiniLM-L6-v2") + +# Sentences are encoded by calling model.encode() emb1 = model.encode("This is a red cat with a hat.") emb2 = model.encode("Have you seen my red cat?") @@ -43,32 +47,34 @@ print("Cosine-Similarity:", cos_sim) If you have a list with more sentences, you can use the following code example: ```python from sentence_transformers import 
SentenceTransformer, util -model = SentenceTransformer('all-MiniLM-L6-v2') - -sentences = ['A man is eating food.', - 'A man is eating a piece of bread.', - 'The girl is carrying a baby.', - 'A man is riding a horse.', - 'A woman is playing violin.', - 'Two men pushed carts through the woods.', - 'A man is riding a white horse on an enclosed ground.', - 'A monkey is playing drums.', - 'Someone in a gorilla costume is playing a set of drums.' - ] - -#Encode all sentences + +model = SentenceTransformer("all-MiniLM-L6-v2") + +sentences = [ + "A man is eating food.", + "A man is eating a piece of bread.", + "The girl is carrying a baby.", + "A man is riding a horse.", + "A woman is playing violin.", + "Two men pushed carts through the woods.", + "A man is riding a white horse on an enclosed ground.", + "A monkey is playing drums.", + "Someone in a gorilla costume is playing a set of drums.", +] + +# Encode all sentences embeddings = model.encode(sentences) -#Compute cosine similarity between all pairs +# Compute cosine similarity between all pairs cos_sim = util.cos_sim(embeddings, embeddings) -#Add all pairs to a list with their cosine similarity score +# Add all pairs to a list with their cosine similarity score all_sentence_combinations = [] -for i in range(len(cos_sim)-1): - for j in range(i+1, len(cos_sim)): +for i in range(len(cos_sim) - 1): + for j in range(i + 1, len(cos_sim)): all_sentence_combinations.append([cos_sim[i][j], i, j]) -#Sort list by the highest cosine similarity score +# Sort list by the highest cosine similarity score all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True) print("Top-5 most similar pairs:") diff --git a/docs/training/overview.md b/docs/training/overview.md index ea0e73640..bd07febd0 100644 --- a/docs/training/overview.md +++ b/docs/training/overview.md @@ -28,7 +28,7 @@ But we can create the networks architectures from scratch by defining the indivi ```python from sentence_transformers 
import SentenceTransformer, models -word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256) +word_embedding_model = models.Transformer("bert-base-uncased", max_seq_length=256) pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) @@ -41,9 +41,13 @@ We can also construct more complex models: from sentence_transformers import SentenceTransformer, models from torch import nn -word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256) +word_embedding_model = models.Transformer("bert-base-uncased", max_seq_length=256) pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) -dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh()) +dense_model = models.Dense( + in_features=pooling_model.get_sentence_embedding_dimension(), + out_features=256, + activation_function=nn.Tanh(), +) model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model]) ``` @@ -55,7 +59,7 @@ Additionally, we can also create SentenceTransformer models from scratch for ima ```py from sentence_transformers import SentenceTransformer, models -image_embedding_model = models.CLIPModel('openai/clip-vit-base-patch32') +image_embedding_model = models.CLIPModel("openai/clip-vit-base-patch32") model = SentenceTransformer(modules=[image_embedding_model]) ``` @@ -66,13 +70,15 @@ For all available building blocks see [» Models Package Reference](../package_r To represent our training data, we use the `InputExample` class to store training examples. As parameters, it accepts texts, which is a list of strings representing our pairs (or triplets). Further, we can also pass a label (either float or int). 
The following shows a simple example, where we pass text pairs to `InputExample` together with a label indicating the semantic similarity. ```python -from sentence_transformers import SentenceTransformer, InputExample -from torch.utils.data import DataLoader - -model = SentenceTransformer('distilbert-base-nli-mean-tokens') -train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8), - InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)] -train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16) + from sentence_transformers import SentenceTransformer, InputExample + from torch.utils.data import DataLoader + + model = SentenceTransformer("distilbert-base-nli-mean-tokens") + train_examples = [ + InputExample(texts=["My first sentence", "My second sentence"], label=0.8), + InputExample(texts=["Another pair", "Unrelated sentence"], label=0.3), + ] + train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16) ``` We wrap our `train_examples` with the standard PyTorch `DataLoader`, which shuffles our data and produces batches of certain sizes. @@ -101,18 +107,20 @@ A minimal example with `CosineSimilarityLoss` is the following: from sentence_transformers import SentenceTransformer, InputExample, losses from torch.utils.data import DataLoader -#Define the model. Either from scratch of by loading a pre-trained model -model = SentenceTransformer('distilbert-base-nli-mean-tokens') +# Define the model. Either from scratch of by loading a pre-trained model +model = SentenceTransformer("distilbert-base-nli-mean-tokens") -#Define your train examples. You need more than just two examples... -train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8), - InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)] +# Define your train examples. You need more than just two examples... 
+train_examples = [ + InputExample(texts=["My first sentence", "My second sentence"], label=0.8), + InputExample(texts=["Another pair", "Unrelated sentence"], label=0.3), +] -#Define your train dataset, the dataloader and the train loss +# Define your train dataset, the dataloader and the train loss train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16) train_loss = losses.CosineSimilarityLoss(model) -#Tune the model +# Tune the model model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100) ``` @@ -133,15 +141,30 @@ During training, we usually want to measure the performance to see if the perfor The usage is simple: ```python from sentence_transformers import evaluation -sentences1 = ['This list contains the first column', 'With your sentences', 'You want your model to evaluate on'] -sentences2 = ['Sentences contains the other column', 'The evaluator matches sentences1[i] with sentences2[i]', 'Compute the cosine similarity and compares it to scores[i]'] + +sentences1 = [ + "This list contains the first column", + "With your sentences", + "You want your model to evaluate on", +] +sentences2 = [ + "Sentences contains the other column", + "The evaluator matches sentences1[i] with sentences2[i]", + "Compute the cosine similarity and compares it to scores[i]", +] scores = [0.3, 0.6, 0.2] evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores) # ... 
Your other code to load training data -model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100, evaluator=evaluator, evaluation_steps=500) +model.fit( + train_objectives=[(train_dataloader, train_loss)], + epochs=1, + warmup_steps=100, + evaluator=evaluator, + evaluation_steps=500, +) ``` @@ -151,7 +174,7 @@ model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_st First, we load a pre-trained model from the server: ```python -model = SentenceTransformer('bert-base-nli-mean-tokens') +model = SentenceTransformer("bert-base-nli-mean-tokens") ``` @@ -160,26 +183,30 @@ The next steps are as before. We specify training and dev data: train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size) train_loss = losses.CosineSimilarityLoss(model=model) -evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples('sts-dev.csv')) +evaluator = EmbeddingSimilarityEvaluator.from_input_examples( + sts_reader.get_examples("sts-dev.csv") +) ``` In that example, we use CosineSimilarityLoss, which computes the cosine similarity between two sentences and compares this score with a provided gold similarity score. Then we can train as before: ```python -model.fit(train_objectives=[(train_dataloader, train_loss)], - evaluator=evaluator, - epochs=num_epochs, - evaluation_steps=1000, - warmup_steps=warmup_steps, - output_path=model_save_path) +model.fit( + train_objectives=[(train_dataloader, train_loss)], + evaluator=evaluator, + epochs=num_epochs, + evaluation_steps=1000, + warmup_steps=warmup_steps, + output_path=model_save_path, +) ``` ## Loading Custom SentenceTransformer Models Loading trained models is easy. You can specify a path: ```python -model = SentenceTransformer('./my/path/to/model/') +model = SentenceTransformer("./my/path/to/model/") ``` Note: It is important that a / or \ is present in the path, otherwise, it is not recognized as a path. 
@@ -200,7 +227,8 @@ This code allows multi-task learning with training data from different datasets Depending on the task, you might want to add special tokens to the tokenizer and the Transformer model. You can use the following code-snippet to achieve this: ```python from sentence_transformers import SentenceTransformer, models -word_embedding_model = models.Transformer('bert-base-uncased') + +word_embedding_model = models.Transformer("bert-base-uncased") tokens = ["[DOC]", "[QRY]"] word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True) @@ -213,7 +241,8 @@ model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) If you want to extend the vocabulary for an existent SentenceTransformer model, you can use the following code: ```python from sentence_transformers import SentenceTransformer, models -model = SentenceTransformer('all-MiniLM-L6-v2') + +model = SentenceTransformer("all-MiniLM-L6-v2") word_embedding_model = model._first_module() tokens = ["[DOC]", "[QRY]"] diff --git a/docs/usage/semantic_textual_similarity.md b/docs/usage/semantic_textual_similarity.md index fc77f66e2..dbd0d5198 100644 --- a/docs/usage/semantic_textual_similarity.md +++ b/docs/usage/semantic_textual_similarity.md @@ -4,27 +4,34 @@ Once you have [sentence embeddings computed](../../examples/applications/comput ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('all-MiniLM-L6-v2') -# Two lists of sentences -sentences1 = ['The cat sits outside', - 'A man is playing guitar', - 'The new movie is awesome'] - -sentences2 = ['The dog plays in the garden', - 'A woman watches TV', - 'The new movie is so great'] +model = SentenceTransformer("all-MiniLM-L6-v2") -#Compute embedding for both lists +# Two lists of sentences +sentences1 = [ + "The cat sits outside", + "A man is playing guitar", + "The new movie is awesome", +] + +sentences2 = [ + "The dog plays in the garden", + "A woman watches TV", + "The new movie is so 
great", +] + +# Compute embedding for both lists embeddings1 = model.encode(sentences1, convert_to_tensor=True) embeddings2 = model.encode(sentences2, convert_to_tensor=True) -#Compute cosine-similarities +# Compute cosine-similarities cosine_scores = util.cos_sim(embeddings1, embeddings2) -#Output the pairs with their score +# Output the pairs with their score for i in range(len(sentences1)): - print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i])) + print("{} \t\t {} \t\t Score: {:.4f}".format( + sentences1[i], sentences2[i], cosine_scores[i][i] + )) ``` We pass the `convert_to_tensor=True` parameter to the encode function. This will return a pytorch tensor containing our embeddings. We can then call `util.cos_sim(A, B)` which computes the cosine similarity between all vectors in *A* and all vectors in *B*. @@ -36,36 +43,40 @@ You can use this function also to find out the pairs with the highest cosine sim ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('all-MiniLM-L6-v2') +model = SentenceTransformer("all-MiniLM-L6-v2") # Single list of sentences -sentences = ['The cat sits outside', - 'A man is playing guitar', - 'I love pasta', - 'The new movie is awesome', - 'The cat plays in the garden', - 'A woman watches TV', - 'The new movie is so great', - 'Do you like pizza?'] - -#Compute embeddings +sentences = [ + "The cat sits outside", + "A man is playing guitar", + "I love pasta", + "The new movie is awesome", + "The cat plays in the garden", + "A woman watches TV", + "The new movie is so great", + "Do you like pizza?", +] + +# Compute embeddings embeddings = model.encode(sentences, convert_to_tensor=True) -#Compute cosine-similarities for each sentence with each other sentence +# Compute cosine-similarities for each sentence with each other sentence cosine_scores = util.cos_sim(embeddings, embeddings) -#Find the pairs with the highest cosine similarity scores +# Find the 
pairs with the highest cosine similarity scores pairs = [] -for i in range(len(cosine_scores)-1): - for j in range(i+1, len(cosine_scores)): - pairs.append({'index': [i, j], 'score': cosine_scores[i][j]}) +for i in range(len(cosine_scores) - 1): + for j in range(i + 1, len(cosine_scores)): + pairs.append({"index": [i, j], "score": cosine_scores[i][j]}) -#Sort scores in decreasing order -pairs = sorted(pairs, key=lambda x: x['score'], reverse=True) +# Sort scores in decreasing order +pairs = sorted(pairs, key=lambda x: x["score"], reverse=True) for pair in pairs[0:10]: - i, j = pair['index'] - print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score'])) + i, j = pair["index"] + print("{} \t\t {} \t\t Score: {:.4f}".format( + sentences[i], sentences[j], pair["score"] + )) ``` Note, in the above approach we use a brute-force approach to find the highest scoring pairs, which has a quadratic complexity. For long lists of sentences, this might be infeasible. If you want to find the highest scoring pairs in a long list of sentences, have a look at [Paraphrase Mining](../../examples/applications/paraphrase-mining/README.md). 
\ No newline at end of file diff --git a/examples/applications/computing-embeddings/README.md b/examples/applications/computing-embeddings/README.md index 4462d531d..d94c1754f 100644 --- a/examples/applications/computing-embeddings/README.md +++ b/examples/applications/computing-embeddings/README.md @@ -5,17 +5,20 @@ The basic function to compute sentence embeddings looks like this: ```python from sentence_transformers import SentenceTransformer -model = SentenceTransformer('all-MiniLM-L6-v2') -#Our sentences we like to encode -sentences = ['This framework generates embeddings for each input sentence', - 'Sentences are passed as a list of strings.', - 'The quick brown fox jumps over the lazy dog.'] +model = SentenceTransformer("all-MiniLM-L6-v2") -#Sentences are encoded by calling model.encode() +# Our sentences we like to encode +sentences = [ + "This framework generates embeddings for each input sentence", + "Sentences are passed as a list of strings.", + "The quick brown fox jumps over the lazy dog.", +] + +# Sentences are encoded by calling model.encode() embeddings = model.encode(sentences) -#Print the embeddings +# Print the embeddings for sentence, embedding in zip(sentences, embeddings): print("Sentence:", sentence) print("Embedding:", embedding) @@ -27,14 +30,15 @@ for sentence, embedding in zip(sentences, embeddings): First, we load a sentence-transformer model: ```python from sentence_transformers import SentenceTransformer -model = SentenceTransformer('model_name_or_path') + +model = SentenceTransformer("model_name_or_path") ``` You can either specify a [pre-trained model](https://www.sbert.net/docs/pretrained_models.html) or you can pass a path on your disc to load the sentence-transformer model from that folder. If available, the model is automatically executed on the GPU. 
You can specify the device for the model like this: ```python -model = SentenceTransformer('model_name_or_path', device='cuda') +model = SentenceTransformer("model_name_or_path", device="cuda") ``` With *device* any pytorch device (like CPU, cuda, cuda:0 etc.) @@ -54,11 +58,12 @@ By default, the provided methods use a limit of 128 word pieces, longer inputs w ```python from sentence_transformers import SentenceTransformer -model = SentenceTransformer('all-MiniLM-L6-v2') + +model = SentenceTransformer("all-MiniLM-L6-v2") print("Max Sequence Length:", model.max_seq_length) -#Change the length to 200 +# Change the length to 200 model.max_seq_length = 200 print("Max Sequence Length:", model.max_seq_length) @@ -74,23 +79,25 @@ The easiest method is to use *pickle* to store pre-computed embeddings on disc a from sentence_transformers import SentenceTransformer import pickle -model = SentenceTransformer('all-MiniLM-L6-v2') -sentences = ['This framework generates embeddings for each input sentence', - 'Sentences are passed as a list of string.', - 'The quick brown fox jumps over the lazy dog.'] +model = SentenceTransformer("all-MiniLM-L6-v2") +sentences = [ + "This framework generates embeddings for each input sentence", + "Sentences are passed as a list of string.", + "The quick brown fox jumps over the lazy dog.", +] embeddings = model.encode(sentences) -#Store sentences & embeddings on disc -with open('embeddings.pkl', "wb") as fOut: - pickle.dump({'sentences': sentences, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL) +# Store sentences & embeddings on disc +with open("embeddings.pkl", "wb") as fOut: + pickle.dump({"sentences": sentences, "embeddings": embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL) -#Load sentences & embeddings from disc -with open('embeddings.pkl', "rb") as fIn: +# Load sentences & embeddings from disc +with open("embeddings.pkl", "rb") as fIn: stored_data = pickle.load(fIn) - stored_sentences = stored_data['sentences'] - 
stored_embeddings = stored_data['embeddings'] + stored_sentences = stored_data["sentences"] + stored_embeddings = stored_data["embeddings"] ``` ## Multi-Process / Multi-GPU Encoding @@ -111,34 +118,37 @@ from transformers import AutoTokenizer, AutoModel import torch -#Mean Pooling - Take attention mask into account for correct averaging +# Mean Pooling - Take attention mask into account for correct averaging def mean_pooling(model_output, attention_mask): - token_embeddings = model_output[0] #First element of model_output contains all token embeddings + token_embeddings = model_output[0] # First element of model_output contains all token embeddings input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) return sum_embeddings / sum_mask +# Sentences we want sentence embeddings for +sentences = [ + "This framework generates embeddings for each input sentence", + "Sentences are passed as a list of string.", + "The quick brown fox jumps over the lazy dog.", +] -#Sentences we want sentence embeddings for -sentences = ['This framework generates embeddings for each input sentence', - 'Sentences are passed as a list of string.', - 'The quick brown fox jumps over the lazy dog.'] - -#Load AutoModel from huggingface model repository +# Load AutoModel from huggingface model repository tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") -#Tokenize sentences -encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt') +# Tokenize sentences +encoded_input = tokenizer( + sentences, padding=True, truncation=True, max_length=128, return_tensors="pt" +) -#Compute token embeddings +# Compute token embeddings with torch.no_grad(): model_output = model(**encoded_input) 
-#Perform pooling. In this case, mean pooling -sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) +# Perform pooling. In this case, mean pooling +sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"]) ``` diff --git a/examples/applications/cross-encoder/README.md b/examples/applications/cross-encoder/README.md index 007a6bd22..1edb6952a 100644 --- a/examples/applications/cross-encoder/README.md +++ b/examples/applications/cross-encoder/README.md @@ -32,9 +32,9 @@ Bi-Encoders (see [Computing Sentence Embeddings](../computing-embeddings/README. Using Cross-Encoders is quite easy: ```python from sentence_transformers.cross_encoder import CrossEncoder -model = CrossEncoder('model_name_or_path') -scores = model.predict([["My first", "sentence pair"], - ["Second text", "pair"]]) + +model = CrossEncoder("model_name_or_path") +scores = model.predict([["My first", "sentence pair"], ["Second text", "pair"]]) ``` You pass to `model.predict` a list of sentence **pairs**. Note that Cross-Encoders do not work on individual sentences; you have to pass sentence pairs. 
diff --git a/examples/applications/image-search/README.md b/examples/applications/image-search/README.md index 07f480c97..f995e409e 100644 --- a/examples/applications/image-search/README.md +++ b/examples/applications/image-search/README.md @@ -15,16 +15,18 @@ SentenceTransformers provides a wrapper for the [OpenAI CLIP Model](https://gith from sentence_transformers import SentenceTransformer, util from PIL import Image -#Load CLIP model -model = SentenceTransformer('clip-ViT-B-32') +# Load CLIP model +model = SentenceTransformer("clip-ViT-B-32") -#Encode an image: -img_emb = model.encode(Image.open('two_dogs_in_snow.jpg')) +# Encode an image: +img_emb = model.encode(Image.open("two_dogs_in_snow.jpg")) -#Encode text descriptions -text_emb = model.encode(['Two dogs in the snow', 'A cat on a table', 'A picture of London at night']) +# Encode text descriptions +text_emb = model.encode( + ["Two dogs in the snow", "A cat on a table", "A picture of London at night"] +) -#Compute cosine similarities +# Compute cosine similarities cos_scores = util.cos_sim(img_emb, text_emb) print(cos_scores) ``` diff --git a/examples/applications/paraphrase-mining/README.md b/examples/applications/paraphrase-mining/README.md index f2faa8374..02ae141eb 100644 --- a/examples/applications/paraphrase-mining/README.md +++ b/examples/applications/paraphrase-mining/README.md @@ -8,17 +8,19 @@ For larger collections, *util* offers the *paraphrase_mining* function that can ```python from sentence_transformers import SentenceTransformer, util -model = SentenceTransformer('all-MiniLM-L6-v2') +model = SentenceTransformer("all-MiniLM-L6-v2") # Single list of sentences - Possible tens of thousands of sentences -sentences = ['The cat sits outside', - 'A man is playing guitar', - 'I love pasta', - 'The new movie is awesome', - 'The cat plays in the garden', - 'A woman watches TV', - 'The new movie is so great', - 'Do you like pizza?'] +sentences = [ + "The cat sits outside", + "A man is playing guitar", 
+ "I love pasta", + "The new movie is awesome", + "The cat plays in the garden", + "A woman watches TV", + "The new movie is so great", + "Do you like pizza?", +] paraphrases = util.paraphrase_mining(model, sentences) diff --git a/examples/applications/retrieve_rerank/README.md b/examples/applications/retrieve_rerank/README.md index 4cf9b4fc8..d6d50de95 100644 --- a/examples/applications/retrieve_rerank/README.md +++ b/examples/applications/retrieve_rerank/README.md @@ -49,9 +49,13 @@ The bi-encoder produces embeddings independently for your paragraphs and for you ```python from sentence_transformers import SentenceTransformer -model = SentenceTransformer('model_name') -docs = ["My first paragraph. That contains information", "Python is a programming language."] +model = SentenceTransformer("model_name") + +docs = [ + "My first paragraph. That contains information", + "Python is a programming language.", +] document_embeddings = model.encode(docs) query = "What is Python?" diff --git a/examples/applications/semantic-search/README.md b/examples/applications/semantic-search/README.md index a34583039..54aff7548 100644 --- a/examples/applications/semantic-search/README.md +++ b/examples/applications/semantic-search/README.md @@ -59,10 +59,10 @@ To get the optimal speed for the `util.semantic_search` method, it is advisable Further, we can normalize the corpus embeddings so that each corpus embedding is of length 1. In that case, we can use dot-product for computing scores. 
```python -corpus_embeddings = corpus_embeddings.to('cuda') +corpus_embeddings = corpus_embeddings.to("cuda") corpus_embeddings = util.normalize_embeddings(corpus_embeddings) -query_embeddings = query_embeddings.to('cuda') +query_embeddings = query_embeddings.to("cuda") query_embeddings = util.normalize_embeddings(query_embeddings) hits = util.semantic_search(query_embeddings, corpus_embeddings, score_function=util.dot_score) ``` diff --git a/examples/training/cross-encoder/README.md b/examples/training/cross-encoder/README.md index 8fd959aa2..0b6e48cf6 100644 --- a/examples/training/cross-encoder/README.md +++ b/examples/training/cross-encoder/README.md @@ -14,19 +14,21 @@ The `CrossEncoder` class is a wrapper around Huggingface `AutoModelForSequenceCl First, you need some sentence pair data. You can either have a continuous score, like: ```python from sentence_transformers import InputExample + train_samples = [ - InputExample(texts=['sentence1', 'sentence2'], label=0.3), - InputExample(texts=['Another', 'pair'], label=0.8), + InputExample(texts=["sentence1", "sentence2"], label=0.3), + InputExample(texts=["Another", "pair"], label=0.8), ] ``` Or you have distinct classes as in the [training_nli.py](training_nli.py) example: ```python from sentence_transformers import InputExample + label2int = {"contradiction": 0, "entailment": 1, "neutral": 2} train_samples = [ - InputExample(texts=['sentence1', 'sentence2'], label=label2int['neutral']), - InputExample(texts=['Another', 'pair'], label=label2int['entailment']), + InputExample(texts=["sentence1", "sentence2"], label=label2int["neutral"]), + InputExample(texts=["Another", "pair"], label=label2int["entailment"]), ] ``` @@ -39,11 +41,13 @@ For binary tasks and tasks with continuous scores (like STS), we set num_labels= We start the training by calling `model.fit()`: ```python -model.fit(train_dataloader=train_dataloader, - evaluator=evaluator, - epochs=num_epochs, - warmup_steps=warmup_steps, - 
output_path=model_save_path) +model.fit( + train_dataloader=train_dataloader, + evaluator=evaluator, + epochs=num_epochs, + warmup_steps=warmup_steps, + output_path=model_save_path, +) ``` diff --git a/examples/training/multilingual/README.md b/examples/training/multilingual/README.md index ba6fd3f28..98a2d706d 100644 --- a/examples/training/multilingual/README.md +++ b/examples/training/multilingual/README.md @@ -13,8 +13,9 @@ For a list of available models, see [Pretrained Models](https://www.sbert.net/do You can use the models in the following way: ```python from sentence_transformers import SentenceTransformer -embedder = SentenceTransformer('model-name') -embeddings = embedder.encode(['Hello World', 'Hallo Welt', 'Hola mundo']) + +embedder = SentenceTransformer("model-name") +embeddings = embedder.encode(["Hello World", "Hallo Welt", "Hola mundo"]) print(embeddings) ``` @@ -140,9 +141,9 @@ You can load such a training file using the *ParallelSentencesDataset* class: from sentence_transformers.datasets import ParallelSentencesDataset train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model) -train_data.load_data('path/to/tab/separated/train-en-de.tsv') -train_data.load_data('path/to/tab/separated/train-en-es.tsv.gz') -train_data.load_data('path/to/tab/separated/train-en-fr.tsv.gz') +train_data.load_data("path/to/tab/separated/train-en-de.tsv") +train_data.load_data("path/to/tab/separated/train-en-es.tsv.gz") +train_data.load_data("path/to/tab/separated/train-en-fr.tsv.gz") train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) train_loss = losses.MSELoss(model=student_model) @@ -169,7 +170,7 @@ You can measure the mean squared error (MSE) between the student embeddings and ```python # src_sentences and trg_sentences are lists of translated sentences, such that trg_sentences[i] is the translation of src_sentences[i] - dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences, 
teacher_model=teacher_model) +dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences, teacher_model=teacher_model) ``` This evaluator computes the teacher embeddings for the `src_sentences`, for example, for English. During training, the student model is used to compute embeddings for the `trg_sentences`, for example, for Spanish. The distance between teacher and student embeddings is measured. Lower scores indicate a better performance. @@ -181,7 +182,12 @@ For each sentence pair, we check if their embeddings are the closest using cosin ```python # src_sentences and trg_sentences are lists of translated sentences, such that trg_sentences[i] is the translation of src_sentences[i] -dev_trans_acc = evaluation.TranslationEvaluator(src_sentences, trg_sentences, name=os.path.basename(dev_file),batch_size=inference_batch_size) +dev_trans_acc = evaluation.TranslationEvaluator( + src_sentences, + trg_sentences, + name=os.path.basename(dev_file), + batch_size=inference_batch_size, +) ``` ### Multi-Lingual Semantic Textual Similarity diff --git a/examples/training/quora_duplicate_questions/README.md b/examples/training/quora_duplicate_questions/README.md index 9ecc2c544..d61b04ac3 100644 --- a/examples/training/quora_duplicate_questions/README.md +++ b/examples/training/quora_duplicate_questions/README.md @@ -11,7 +11,8 @@ Currently the following models trained on Quora Duplicate Questions are availabl You can load & use pre-trained models like this: ```python from sentence_transformers import SentenceTransformer -model = SentenceTransformer('model_name') + +model = SentenceTransformer("model_name") ``` @@ -63,10 +64,15 @@ An improved version of contrastive loss is OnlineContrastiveLoss, which looks wh The loss can be used like this: ```python train_samples = [] -with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding='utf8') as fIn: - reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) +with open( + 
os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding="utf8" +) as fIn: + reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE) for row in reader: - sample = InputExample(texts=[row['question1'], row['question2']], label=int(row['is_duplicate'])) + sample = InputExample( + texts=[row["question1"], row["question2"]], + label=int(row["is_duplicate"]), + ) train_samples.append(sample) @@ -95,12 +101,16 @@ MultipleNegativesRankingLoss now uses all *b_j* with j != i as negative example Using the loss is easy and does not require tuning of any hyperparameters: ```python train_samples = [] -with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding='utf8') as fIn: - reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) +with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding="utf8") as fIn: + reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE) for row in reader: - if row['is_duplicate'] == '1': - train_samples.append(InputExample(texts=[row['question1'], row['question2']], label=1)) - train_samples.append(InputExample(texts=[row['question2'], row['question1']], label=1)) #if A is a duplicate of B, then B is a duplicate of A + if row["is_duplicate"] == "1": + train_samples.append( + InputExample(texts=[row["question1"], row["question2"]], label=1) + ) + train_samples.append( + InputExample(texts=[row["question2"], row["question1"]], label=1) + ) # if A is a duplicate of B, then B is a duplicate of A # After reading the train_samples, we create a SentencesDataset and a DataLoader @@ -125,32 +135,57 @@ In [training_multi-task-learning.py](training_multi-task-learning.py) I demonstr train_samples_MultipleNegativesRankingLoss = [] train_samples_ContrastiveLoss = [] -with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding='utf8') as fIn: - reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) +with open(os.path.join(dataset_path, 
"classification/train_pairs.tsv"), encoding="utf8") as fIn: + reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE) for row in reader: - train_samples_ContrastiveLoss.append(InputExample(texts=[row['question1'], row['question2']], label=int(row['is_duplicate']))) - if row['is_duplicate'] == '1': - train_samples_MultipleNegativesRankingLoss.append(InputExample(texts=[row['question1'], row['question2']], label=1)) - train_samples_MultipleNegativesRankingLoss.append(InputExample(texts=[row['question2'], row['question1']], label=1)) # if A is a duplicate of B, then B is a duplicate of A + train_samples_ContrastiveLoss.append( + InputExample( + texts=[row["question1"], row["question2"]], + label=int(row["is_duplicate"]), + ) + ) + if row["is_duplicate"] == "1": + train_samples_MultipleNegativesRankingLoss.append( + InputExample(texts=[row["question1"], row["question2"]], label=1) + ) + train_samples_MultipleNegativesRankingLoss.append( + InputExample(texts=[row["question2"], row["question1"]], label=1) + ) # if A is a duplicate of B, then B is a duplicate of A # Create data loader and loss for MultipleNegativesRankingLoss -train_dataset_MultipleNegativesRankingLoss = SentencesDataset(train_samples_MultipleNegativesRankingLoss, model=model) -train_dataloader_MultipleNegativesRankingLoss = DataLoader(train_dataset_MultipleNegativesRankingLoss, shuffle=True, batch_size=train_batch_size) +train_dataset_MultipleNegativesRankingLoss = SentencesDataset( + train_samples_MultipleNegativesRankingLoss, model=model +) +train_dataloader_MultipleNegativesRankingLoss = DataLoader( + train_dataset_MultipleNegativesRankingLoss, + shuffle=True, + batch_size=train_batch_size, +) train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model) # Create data loader and loss for OnlineContrastiveLoss -train_dataset_ConstrativeLoss = SentencesDataset(train_samples_ConstrativeLoss, model=model) -train_dataloader_ConstrativeLoss = 
DataLoader(train_dataset_ConstrativeLoss, shuffle=True, batch_size=train_batch_size) -train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin) +train_dataset_ConstrativeLoss = SentencesDataset( + train_samples_ConstrativeLoss, model=model +) +train_dataloader_ConstrativeLoss = DataLoader( + train_dataset_ConstrativeLoss, shuffle=True, batch_size=train_batch_size +) +train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss( + model=model, distance_metric=distance_metric, margin=margin +) # ..... # Train the model -model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss), (train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss)], - evaluator=seq_evaluator, - epochs=num_epochs, - warmup_steps=1000, - output_path=model_save_path - ) +model.fit( + train_objectives=[ + (train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss), + (train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss), + ], + evaluator=seq_evaluator, + epochs=num_epochs, + warmup_steps=1000, + output_path=model_save_path, +) ``` diff --git a/examples/training/sts/README.md b/examples/training/sts/README.md index 961296857..e95266da6 100644 --- a/examples/training/sts/README.md +++ b/examples/training/sts/README.md @@ -12,11 +12,18 @@ In STS, we have sentence pairs annotated together with a score indicating the si To store our training data, we create a list with `InputExample` objects. Each `InputExample` contains the sentence pair together with the label (score) that ranges between 0 - 1. 
A simplified version of how the training data has to look is the following: ```python -from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses - -model = SentenceTransformer('nli-distilroberta-base-v2') -train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8), - InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)] +from sentence_transformers import ( + SentenceTransformer, + SentencesDataset, + InputExample, + losses, +) + +model = SentenceTransformer("nli-distilroberta-base-v2") +train_examples = [ + InputExample(texts=["My first sentence", "My second sentence"], label=0.8), + InputExample(texts=["Another pair", "Unrelated sentence"], label=0.3), +] train_dataset = SentencesDataset(train_examples, model) ``` diff --git a/examples/unsupervised_learning/SimCSE/README.md b/examples/unsupervised_learning/SimCSE/README.md index cf657bac4..6b670de0b 100644 --- a/examples/unsupervised_learning/SimCSE/README.md +++ b/examples/unsupervised_learning/SimCSE/README.md @@ -14,16 +14,18 @@ from sentence_transformers import models, losses from torch.utils.data import DataLoader # Define your sentence transformer model using CLS pooling -model_name = 'distilroberta-base' +model_name = "distilroberta-base" word_embedding_model = models.Transformer(model_name, max_seq_length=32) pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) # Define a list with sentences (1k - 100k sentences) -train_sentences = ["Your set of sentences", - "Model will automatically add the noise", - "And re-construct it", - "You should provide at least 1k sentences"] +train_sentences = [ + "Your set of sentences", + "Model will automatically add the noise", + "And re-construct it", + "You should provide at least 1k sentences", +] # Convert train sentences to sentence pairs train_data = [InputExample(texts=[s, 
s]) for s in train_sentences] @@ -36,12 +38,10 @@ train_loss = losses.MultipleNegativesRankingLoss(model) # Call the fit method model.fit( - train_objectives=[(train_dataloader, train_loss)], - epochs=1, - show_progress_bar=True + train_objectives=[(train_dataloader, train_loss)], epochs=1, show_progress_bar=True ) -model.save('output/simcse-model') +model.save("output/simcse-model") ``` ## SimCSE from Sentences File diff --git a/examples/unsupervised_learning/TSDAE/README.md b/examples/unsupervised_learning/TSDAE/README.md index 13b534bf5..163dbcc87 100644 --- a/examples/unsupervised_learning/TSDAE/README.md +++ b/examples/unsupervised_learning/TSDAE/README.md @@ -15,16 +15,18 @@ from sentence_transformers import models, util, datasets, evaluation, losses from torch.utils.data import DataLoader # Define your sentence transformer model using CLS pooling -model_name = 'bert-base-uncased' +model_name = "bert-base-uncased" word_embedding_model = models.Transformer(model_name) -pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls') +pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls") model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) # Define a list with sentences (1k - 100k sentences) -train_sentences = ["Your set of sentences", - "Model will automatically add the noise", - "And re-construct it", - "You should provide at least 1k sentences"] +train_sentences = [ + "Your set of sentences", + "Model will automatically add the noise", + "And re-construct it", + "You should provide at least 1k sentences", +] # Create the special denoising dataset that adds noise on-the-fly train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences) @@ -33,19 +35,21 @@ train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences) train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True) # Use the denoising auto-encoder loss -train_loss = 
losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True) +train_loss = losses.DenoisingAutoEncoderLoss( + model, decoder_name_or_path=model_name, tie_encoder_decoder=True +) # Call the fit method model.fit( train_objectives=[(train_dataloader, train_loss)], epochs=1, weight_decay=0, - scheduler='constantlr', - optimizer_params={'lr': 3e-5}, - show_progress_bar=True + scheduler="constantlr", + optimizer_params={"lr": 3e-5}, + show_progress_bar=True, ) -model.save('output/tsdae-model') +model.save("output/tsdae-model") ``` ## TSDAE from Sentences File diff --git a/examples/unsupervised_learning/query_generation/README.md b/examples/unsupervised_learning/query_generation/README.md index 74540eb39..de237a269 100644 --- a/examples/unsupervised_learning/query_generation/README.md +++ b/examples/unsupervised_learning/query_generation/README.md @@ -50,20 +50,21 @@ In [BeIR](https://huggingface.co/BeIR) we provide different models that can be u from transformers import T5Tokenizer, T5ForConditionalGeneration import torch -tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1') -model = T5ForConditionalGeneration.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1') +tokenizer = T5Tokenizer.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1") +model = T5ForConditionalGeneration.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1") model.eval() para = "Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects." 
-input_ids = tokenizer.encode(para, return_tensors='pt') +input_ids = tokenizer.encode(para, return_tensors="pt") with torch.no_grad(): outputs = model.generate( input_ids=input_ids, max_length=64, do_sample=True, top_p=0.95, - num_return_sequences=3) + num_return_sequences=3, + ) print("Paragraph:") print(para) @@ -71,7 +72,7 @@ print(para) print("\nGenerated Queries:") for i in range(len(outputs)): query = tokenizer.decode(outputs[i], skip_special_tokens=True) - print(f'{i + 1}: {query}') + print(f"{i + 1}: {query}") ``` In the above code, we use [Top-p (nucleus) sampling](https://huggingface.co/blog/how-to-generate) which will randomly pick a word from a collection of likely words. As a consequence, the model will generate different queries each time.