From e6a3dca6970076e5dd709e3e0e8e9a6e11167390 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:13:20 +0100
Subject: [PATCH] [`model_card`] Keep the model card readable even with many
datasets (#3088)
* Keep the model card readable even with many datasets
* Avoid 2 consecutive empty lines in model card
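Concretely: the intro sentence now falls back to a bare dataset count (e.g. "trained on 50 datasets") once the ", "-joined dataset names exceed 200 characters, and the per-dataset sections are wrapped in collapsible <details><summary> blocks once a split has more than 3 datasets. A minimal Python sketch of the intro fallback (describe_datasets is an illustrative helper, not code from this patch; see the template diff below for the exact Jinja condition):
```python
def describe_datasets(names: list[str]) -> str:
    # Hypothetical helper, not part of sentence-transformers: mirrors the
    # Jinja condition added in this patch.
    if len(", ".join(names)) > 200:
        # Names too long to list outright: fall back to a bare count,
        # rendered as e.g. "trained on 50 datasets".
        return f"{len(names)} datasets"
    if len(names) == 1:
        return f"the {names[0]} dataset"
    # Matches the template's join style: "a, b and c" (no Oxford comma)
    return f"the {', '.join(names[:-1])} and {names[-1]} datasets"

print(describe_datasets([f"train_{i}" for i in range(2)]))   # the train_0 and train_1 datasets
print(describe_datasets([f"train_{i}" for i in range(50)]))  # 50 datasets
```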
---
sentence_transformers/model_card_template.md | 12 +-
tests/test_model_card.py | 111 +++++++++++++++++++
2 files changed, 118 insertions(+), 5 deletions(-)
create mode 100644 tests/test_model_card.py
diff --git a/sentence_transformers/model_card_template.md b/sentence_transformers/model_card_template.md
index ae428d3fe..5bec7cf27 100644
--- a/sentence_transformers/model_card_template.md
+++ b/sentence_transformers/model_card_template.md
@@ -6,7 +6,7 @@
# {{ model_name if model_name else "Sentence Transformer model" }}
-This is a [sentence-transformers](https://www.SBERT.net) model{% if base_model %} finetuned from [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %} trained{% endif %}{% if train_datasets | selectattr("name") | list %} on the {% for dataset in (train_datasets | selectattr("name")) %}{% if dataset.id %}[{{ dataset.name if dataset.name else dataset.id }}](https://huggingface.co/datasets/{{ dataset.id }}){% else %}{{ dataset.name }}{% endif %}{% if not loop.last %}{% if loop.index == (train_datasets | selectattr("name") | list | length - 1) %} and {% else %}, {% endif %}{% endif %}{% endfor %} dataset{{"s" if train_datasets | selectattr("name") | list | length > 1 else ""}}{% endif %}. It maps sentences & paragraphs to a {{ output_dimensionality }}-dimensional dense vector space and can be used for {{ task_name }}.
+This is a [sentence-transformers](https://www.SBERT.net) model{% if base_model %} finetuned from [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %} trained{% endif %}{% if train_datasets | selectattr("name") | list %} on {% if train_datasets | selectattr("name") | map(attribute="name") | join(", ") | length > 200 %}{{ train_datasets | length }}{% else %}the {% for dataset in (train_datasets | selectattr("name")) %}{% if dataset.id %}[{{ dataset.name if dataset.name else dataset.id }}](https://huggingface.co/datasets/{{ dataset.id }}){% else %}{{ dataset.name }}{% endif %}{% if not loop.last %}{% if loop.index == (train_datasets | selectattr("name") | list | length - 1) %} and {% else %}, {% endif %}{% endif %}{% endfor %}{% endif %} dataset{{"s" if train_datasets | selectattr("name") | list | length > 1 else ""}}{% endif %}. It maps sentences & paragraphs to a {{ output_dimensionality }}-dimensional dense vector space and can be used for {{ task_name }}.
## Model Details
@@ -156,10 +156,11 @@ You can finetune this model on your own dataset.
## Training Details
{% for dataset_type, dataset_list in [("training", train_datasets), ("evaluation", eval_datasets)] %}{% if dataset_list %}
### {{ dataset_type.title() }} Dataset{{"s" if dataset_list | length > 1 else ""}}
-{% for dataset in dataset_list %}
+{% for dataset in dataset_list %}{% if dataset_list | length > 3 %}<details><summary>{{ dataset['name'] or 'Unnamed Dataset' }}</summary>
+{% endif %}
#### {{ dataset['name'] or 'Unnamed Dataset' }}
-
-{% if dataset['name'] %}* Dataset: {% if 'id' in dataset %}[{{ dataset['name'] }}](https://huggingface.co/datasets/{{ dataset['id'] }}){% else %}{{ dataset['name'] }}{% endif %}
+{% if dataset['name'] %}
+* Dataset: {% if 'id' in dataset %}[{{ dataset['name'] }}](https://huggingface.co/datasets/{{ dataset['id'] }}){% else %}{{ dataset['name'] }}{% endif %}
{%- if 'revision' in dataset and 'id' in dataset %} at [{{ dataset['revision'][:7] }}](https://huggingface.co/datasets/{{ dataset['id'] }}/tree/{{ dataset['revision'] }}){% endif %}{% endif %}
{% if dataset['size'] %}* Size: {{ "{:,}".format(dataset['size']) }} {{ dataset_type }} samples
{% endif %}* Columns: {% if dataset['columns'] | length == 1 %}{{ dataset['columns'][0] }}{% elif dataset['columns'] | length == 2 %}{{ dataset['columns'][0] }} and {{ dataset['columns'][1] }}{% else %}{{ dataset['columns'][:-1] | join(', ') }}, and {{ dataset['columns'][-1] }}{% endif %}
@@ -167,7 +168,8 @@ You can finetune this model on your own dataset.
{{ dataset['stats_table'] }}{% endif %}{% if dataset['examples_table'] %}* Samples:
{{ dataset['examples_table'] }}{% endif %}* Loss: {% if dataset["loss"]["fullname"].startswith("sentence_transformers.") %}[<code>{{ dataset["loss"]["fullname"].split(".")[-1] }}</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#{{ dataset["loss"]["fullname"].split(".")[-1].lower() }}){% else %}<code>{{ dataset["loss"]["fullname"] }}</code>
{% endif %}{% if "config_code" in dataset["loss"] %} with these parameters:
{{ dataset["loss"]["config_code"] }}{% endif %}
-{% endfor %}{% endif %}{% endfor -%}
+{% if dataset_list | length > 3 %}</details>
+{% endif %}{% endfor %}{% endif %}{% endfor -%}
{% if all_hyperparameters %}
### Training Hyperparameters
diff --git a/tests/test_model_card.py b/tests/test_model_card.py
new file mode 100644
index 000000000..e86cd1462
--- /dev/null
+++ b/tests/test_model_card.py
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+import pytest
+from datasets import Dataset, DatasetDict
+
+from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
+from sentence_transformers.model_card import generate_model_card
+
+
+@pytest.fixture(scope="session")
+def dummy_dataset():
+ """
+ Dummy dataset for testing purposes. The dataset looks as follows:
+ {
+ "anchor": ["anchor 1", "anchor 2", ..., "anchor 10"],
+ "positive": ["positive 1", "positive 2", ..., "positive 10"],
+ "negative": ["negative 1", "negative 2", ..., "negative 10"],
+ }
+ """
+ return Dataset.from_dict(
+ {
+ "anchor": [f"anchor {i}" for i in range(1, 11)],
+ "positive": [f"positive {i}" for i in range(1, 11)],
+ "negative": [f"negative {i}" for i in range(1, 11)],
+ }
+ )
+
+
+@pytest.mark.parametrize(
+ ("num_datasets", "expected_substrings"),
+ [
+ # 0 actually refers to just a single dataset
+ (
+ 0,
+ [
+ "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors).",
+ "**Maximum Sequence Length:** 512 tokens",
+ "**Output Dimensionality:** 128 dimensions",
+ "**Similarity Function:** Cosine Similarity",
+ "#### Unnamed Dataset",
+ " | anchor 1
| positive 1
| negative 1
|",
+ "* Loss: [CoSENTLoss
](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters:",
+ ],
+ ),
+        (
+            1,
+            [
+                "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0 dataset.",
+                "#### train_0",
+            ],
+        ),
+        (
+            2,
+            [
+                "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0 and train_1 datasets.",
+                "#### train_0",
+                "#### train_1",
+            ],
+        ),
+        (
+            10,
+            [
+                "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets.",
+                "<details><summary>train_0</summary>",  # We start using <details><summary> if we have more than 3 datasets
+                "#### train_0",
+                "</details>\n<details><summary>train_9</summary>",
+                "#### train_9",
+            ],
+        ),
+ # We start using "50 datasets" when the ", "-joined dataset name exceed 200 characters
+ (
+ 50,
+ [
+ "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on 50 datasets.",
+ "train_0
",
+ "#### train_0",
+ " \ntrain_49
",
+ "#### train_49",
+ ],
+ ),
+ ],
+)
+def test_model_card_base(
+    stsb_bert_tiny_model: SentenceTransformer,
+    dummy_dataset: Dataset,
+    num_datasets: int,
+    expected_substrings: list[str],
+) -> None:
+    model = stsb_bert_tiny_model
+
+    train_dataset = dummy_dataset
+    if num_datasets:
+        train_dataset = DatasetDict({f"train_{i}": train_dataset for i in range(num_datasets)})
+
+    # This adds data to model.model_card_data
+    SentenceTransformerTrainer(
+        model,
+        train_dataset=train_dataset,
+    )
+
+    model_card = generate_model_card(model)
+
+    # For debugging purposes, we save the model card to a file
+    # with open(f"test_model_card_{num_datasets}.md", "w", encoding="utf8") as f:
+    #     f.write(model_card)
+
+    for substring in expected_substrings:
+        assert substring in model_card
+
+    # We don't want to have two consecutive empty lines anywhere
+    assert "\n\n\n" not in model_card