[model_card] Keep the model card readable even with many datasets (#3088)

* Keep the model card readable even with many datasets

* Avoid 2 consecutive empty lines in model card
tomaarsen authored Nov 26, 2024
1 parent b055b5d commit e6a3dca
Showing 2 changed files with 118 additions and 5 deletions.
12 changes: 7 additions & 5 deletions sentence_transformers/model_card_template.md
@@ -6,7 +6,7 @@

# {{ model_name if model_name else "Sentence Transformer model" }}

- This is a [sentence-transformers](https://www.SBERT.net) model{% if base_model %} finetuned from [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %} trained{% endif %}{% if train_datasets | selectattr("name") | list %} on the {% for dataset in (train_datasets | selectattr("name")) %}{% if dataset.id %}[{{ dataset.name if dataset.name else dataset.id }}](https://huggingface.co/datasets/{{ dataset.id }}){% else %}{{ dataset.name }}{% endif %}{% if not loop.last %}{% if loop.index == (train_datasets | selectattr("name") | list | length - 1) %} and {% else %}, {% endif %}{% endif %}{% endfor %} dataset{{"s" if train_datasets | selectattr("name") | list | length > 1 else ""}}{% endif %}. It maps sentences & paragraphs to a {{ output_dimensionality }}-dimensional dense vector space and can be used for {{ task_name }}.
+ This is a [sentence-transformers](https://www.SBERT.net) model{% if base_model %} finetuned from [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %} trained{% endif %}{% if train_datasets | selectattr("name") | list %} on {% if train_datasets | selectattr("name") | map(attribute="name") | join(", ") | length > 200 %}{{ train_datasets | length }}{% else %}the {% for dataset in (train_datasets | selectattr("name")) %}{% if dataset.id %}[{{ dataset.name if dataset.name else dataset.id }}](https://huggingface.co/datasets/{{ dataset.id }}){% else %}{{ dataset.name }}{% endif %}{% if not loop.last %}{% if loop.index == (train_datasets | selectattr("name") | list | length - 1) %} and {% else %}, {% endif %}{% endif %}{% endfor %}{% endif %} dataset{{"s" if train_datasets | selectattr("name") | list | length > 1 else ""}}{% endif %}. It maps sentences & paragraphs to a {{ output_dimensionality }}-dimensional dense vector space and can be used for {{ task_name }}.

## Model Details

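The key change in this hunk is a new fallback: when the ", "-joined dataset names exceed 200 characters, the opening sentence reports a plain count ("on 50 datasets") instead of listing every name. Below is a minimal sketch of that logic in Python, using a reduced, hypothetical template rather than the shipped one (assumes `jinja2` is installed):

```python
# Reduced, hypothetical template illustrating the new 200-character fallback;
# the real template also renders dataset links and an "and" before the last name.
from jinja2 import Template

snippet = Template(
    "trained on "
    "{% if datasets | map(attribute='name') | join(', ') | length > 200 %}"
    "{{ datasets | length }} datasets"
    "{% else %}"
    "the {{ datasets | map(attribute='name') | join(', ') }} "
    "dataset{{ 's' if datasets | length > 1 else '' }}"
    "{% endif %}"
)

few = [{"name": f"train_{i}"} for i in range(2)]
many = [{"name": f"train_{i}"} for i in range(50)]

print(snippet.render(datasets=few))   # trained on the train_0, train_1 datasets
print(snippet.render(datasets=many))  # trained on 50 datasets
```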
@@ -156,18 +156,20 @@ You can finetune this model on your own dataset.
## Training Details
{% for dataset_type, dataset_list in [("training", train_datasets), ("evaluation", eval_datasets)] %}{% if dataset_list %}
### {{ dataset_type.title() }} Dataset{{"s" if dataset_list | length > 1 else ""}}
- {% for dataset in dataset_list %}
+ {% for dataset in dataset_list %}{% if dataset_list | length > 3 %}<details><summary>{{ dataset['name'] or 'Unnamed Dataset' }}</summary>
+ {% endif %}
#### {{ dataset['name'] or 'Unnamed Dataset' }}

- {% if dataset['name'] %}* Dataset: {% if 'id' in dataset %}[{{ dataset['name'] }}](https://huggingface.co/datasets/{{ dataset['id'] }}){% else %}{{ dataset['name'] }}{% endif %}
+ {% if dataset['name'] %}
+ * Dataset: {% if 'id' in dataset %}[{{ dataset['name'] }}](https://huggingface.co/datasets/{{ dataset['id'] }}){% else %}{{ dataset['name'] }}{% endif %}
{%- if 'revision' in dataset and 'id' in dataset %} at [{{ dataset['revision'][:7] }}](https://huggingface.co/datasets/{{ dataset['id'] }}/tree/{{ dataset['revision'] }}){% endif %}{% endif %}
{% if dataset['size'] %}* Size: {{ "{:,}".format(dataset['size']) }} {{ dataset_type }} samples
{% endif %}* Columns: {% if dataset['columns'] | length == 1 %}{{ dataset['columns'][0] }}{% elif dataset['columns'] | length == 2 %}{{ dataset['columns'][0] }} and {{ dataset['columns'][1] }}{% else %}{{ dataset['columns'][:-1] | join(', ') }}, and {{ dataset['columns'][-1] }}{% endif %}
{% if dataset['stats_table'] %}* Approximate statistics based on the first {{ [dataset['size'], 1000] | min }} samples:
{{ dataset['stats_table'] }}{% endif %}{% if dataset['examples_table'] %}* Samples:
{{ dataset['examples_table'] }}{% endif %}* Loss: {% if dataset["loss"]["fullname"].startswith("sentence_transformers.") %}[<code>{{ dataset["loss"]["fullname"].split(".")[-1] }}</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#{{ dataset["loss"]["fullname"].split(".")[-1].lower() }}){% else %}<code>{{ dataset["loss"]["fullname"] }}</code>{% endif %}{% if "config_code" in dataset["loss"] %} with these parameters:
{{ dataset["loss"]["config_code"] }}{% endif %}
- {% endfor %}{% endif %}{% endfor -%}
+ {% if dataset_list | length > 3 %}</details>
+ {% endif %}{% endfor %}{% endif %}{% endfor -%}

{% if all_hyperparameters %}
### Training Hyperparameters
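The second hunk wraps each per-dataset section in a collapsible `<details>` block once there are more than 3 datasets, so the Training Details section renders compactly. A minimal sketch of the same idea, again with a reduced, hypothetical template rather than the shipped one:

```python
# Reduced, hypothetical template: with more than 3 datasets, each section is
# wrapped in <details><summary>…</summary> so it renders collapsed by default.
from jinja2 import Template

section = Template(
    "{% for name in datasets %}"
    "{% if datasets | length > 3 %}<details><summary>{{ name }}</summary>\n{% endif %}"
    "#### {{ name }}\n"
    "{% if datasets | length > 3 %}</details>\n{% endif %}"
    "{% endfor %}"
)

print(section.render(datasets=["train_0", "train_1"]))            # plain headings
print(section.render(datasets=[f"train_{i}" for i in range(4)]))  # collapsible blocks
```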
111 changes: 111 additions & 0 deletions tests/test_model_card.py
@@ -0,0 +1,111 @@
from __future__ import annotations

import pytest
from datasets import Dataset, DatasetDict

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.model_card import generate_model_card


@pytest.fixture(scope="session")
def dummy_dataset():
"""
Dummy dataset for testing purposes. The dataset looks as follows:
{
"anchor": ["anchor 1", "anchor 2", ..., "anchor 10"],
"positive": ["positive 1", "positive 2", ..., "positive 10"],
"negative": ["negative 1", "negative 2", ..., "negative 10"],
}
"""
return Dataset.from_dict(
{
"anchor": [f"anchor {i}" for i in range(1, 11)],
"positive": [f"positive {i}" for i in range(1, 11)],
"negative": [f"negative {i}" for i in range(1, 11)],
}
)


@pytest.mark.parametrize(
("num_datasets", "expected_substrings"),
[
# 0 means a single, unnamed dataset passed directly (not wrapped in a DatasetDict)
(
0,
[
"This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors).",
"**Maximum Sequence Length:** 512 tokens",
"**Output Dimensionality:** 128 dimensions",
"**Similarity Function:** Cosine Similarity",
"#### Unnamed Dataset",
" | <code>anchor 1</code> | <code>positive 1</code> | <code>negative 1</code> |",
"* Loss: [<code>CoSENTLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters:",
],
),
(
1,
[
"This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0 dataset.",
"#### train_0",
],
),
(
2,
[
"This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0 and train_1 datasets.",
"#### train_0",
"#### train_1",
],
),
(
10,
[
"This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets.",
"<details><summary>train_0</summary>", # We start using <details><summary> if we have more than 3 datasets
"#### train_0",
"</details>\n<details><summary>train_9</summary>",
"#### train_9",
],
),
# We start using "50 datasets" when the ", "-joined dataset name exceed 200 characters
(
50,
[
"This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on 50 datasets.",
"<details><summary>train_0</summary>",
"#### train_0",
"</details>\n<details><summary>train_49</summary>",
"#### train_49",
],
),
],
)
def test_model_card_base(
stsb_bert_tiny_model: SentenceTransformer,
dummy_dataset: Dataset,
num_datasets: int,
expected_substrings: list[str],
) -> None:
model = stsb_bert_tiny_model

train_dataset = dummy_dataset
if num_datasets:
train_dataset = DatasetDict({f"train_{i}": train_dataset for i in range(num_datasets)})

# This adds data to model.model_card_data
SentenceTransformerTrainer(
model,
train_dataset=train_dataset,
)

model_card = generate_model_card(model)

# For debugging purposes, uncomment to save the model card to a file
# with open(f"test_model_card_{num_datasets}.md", "w", encoding="utf8") as f:
# f.write(model_card)

for substring in expected_substrings:
assert substring in model_card

# We don't want to have two consecutive empty lines anywhere
assert "\n\n\n" not in model_card
