From e6a3dca6970076e5dd709e3e0e8e9a6e11167390 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:13:20 +0100
Subject: [PATCH] [`model_card`] Keep the model card readable even with many
 datasets (#3088)

* Keep the model card readable even with many datasets

* Avoid 2 consecutive empty lines in model card
---
 sentence_transformers/model_card_template.md |  12 +-
 tests/test_model_card.py                     | 111 +++++++++++++++++++
 2 files changed, 118 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_model_card.py

diff --git a/sentence_transformers/model_card_template.md b/sentence_transformers/model_card_template.md
index ae428d3fe..5bec7cf27 100644
--- a/sentence_transformers/model_card_template.md
+++ b/sentence_transformers/model_card_template.md
@@ -6,7 +6,7 @@
 
 # {{ model_name if model_name else "Sentence Transformer model" }}
 
-This is a [sentence-transformers](https://www.SBERT.net) model{% if base_model %} finetuned from [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %} trained{% endif %}{% if train_datasets | selectattr("name") | list %} on the {% for dataset in (train_datasets | selectattr("name")) %}{% if dataset.id %}[{{ dataset.name if dataset.name else dataset.id }}](https://huggingface.co/datasets/{{ dataset.id }}){% else %}{{ dataset.name }}{% endif %}{% if not loop.last %}{% if loop.index == (train_datasets | selectattr("name") | list | length - 1) %} and {% else %}, {% endif %}{% endif %}{% endfor %} dataset{{"s" if train_datasets | selectattr("name") | list | length > 1 else ""}}{% endif %}. It maps sentences & paragraphs to a {{ output_dimensionality }}-dimensional dense vector space and can be used for {{ task_name }}.
+This is a [sentence-transformers](https://www.SBERT.net) model{% if base_model %} finetuned from [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %} trained{% endif %}{% if train_datasets | selectattr("name") | list %} on {% if train_datasets | selectattr("name") | map(attribute="name") | join(", ") | length > 200 %}{{ train_datasets | length }}{% else %}the {% for dataset in (train_datasets | selectattr("name")) %}{% if dataset.id %}[{{ dataset.name if dataset.name else dataset.id }}](https://huggingface.co/datasets/{{ dataset.id }}){% else %}{{ dataset.name }}{% endif %}{% if not loop.last %}{% if loop.index == (train_datasets | selectattr("name") | list | length - 1) %} and {% else %}, {% endif %}{% endif %}{% endfor %}{% endif %} dataset{{"s" if train_datasets | selectattr("name") | list | length > 1 else ""}}{% endif %}. It maps sentences & paragraphs to a {{ output_dimensionality }}-dimensional dense vector space and can be used for {{ task_name }}.
 
 ## Model Details
 
@@ -156,10 +156,11 @@ You can finetune this model on your own dataset.
 ## Training Details
 {% for dataset_type, dataset_list in [("training", train_datasets), ("evaluation", eval_datasets)] %}{% if dataset_list %}
 ### {{ dataset_type.title() }} Dataset{{"s" if dataset_list | length > 1 else ""}}
-{% for dataset in dataset_list %}
+{% for dataset in dataset_list %}{% if dataset_list | length > 3 %}<details><summary>{{ dataset['name'] or 'Unnamed Dataset' }}</summary>
+{% endif %}
 #### {{ dataset['name'] or 'Unnamed Dataset' }}
-
-{% if dataset['name'] %}* Dataset: {% if 'id' in dataset %}[{{ dataset['name'] }}](https://huggingface.co/datasets/{{ dataset['id'] }}){% else %}{{ dataset['name'] }}{% endif %}
+{% if dataset['name'] %}
+* Dataset: {% if 'id' in dataset %}[{{ dataset['name'] }}](https://huggingface.co/datasets/{{ dataset['id'] }}){% else %}{{ dataset['name'] }}{% endif %}
 {%- if 'revision' in dataset and 'id' in dataset %} at [{{ dataset['revision'][:7] }}](https://huggingface.co/datasets/{{ dataset['id'] }}/tree/{{ dataset['revision'] }}){% endif %}{% endif %}
 {% if dataset['size'] %}* Size: {{ "{:,}".format(dataset['size']) }} {{ dataset_type }} samples
 {% endif %}* Columns: {% if dataset['columns'] | length == 1 %}{{ dataset['columns'][0] }}{% elif dataset['columns'] | length == 2 %}{{ dataset['columns'][0] }} and {{ dataset['columns'][1] }}{% else %}{{ dataset['columns'][:-1] | join(', ') }}, and {{ dataset['columns'][-1] }}{% endif %}
@@ -167,7 +168,8 @@ You can finetune this model on your own dataset.
 {{ dataset['stats_table'] }}{% endif %}{% if dataset['examples_table'] %}* Samples:
 {{ dataset['examples_table'] }}{% endif %}* Loss: {% if dataset["loss"]["fullname"].startswith("sentence_transformers.") %}[{{ dataset["loss"]["fullname"].split(".")[-1] }}](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#{{ dataset["loss"]["fullname"].split(".")[-1].lower() }}){% else %}{{ dataset["loss"]["fullname"] }}{% endif %}{% if "config_code" in dataset["loss"] %} with these parameters:
 {{ dataset["loss"]["config_code"] }}{% endif %}
-{% endfor %}{% endif %}{% endfor -%}
+{% if dataset_list | length > 3 %}</details>
+{% endif %}{% endfor %}{% endif %}{% endfor -%}
 {% if all_hyperparameters %}
 ### Training Hyperparameters
 
diff --git a/tests/test_model_card.py b/tests/test_model_card.py
new file mode 100644
index 000000000..e86cd1462
--- /dev/null
+++ b/tests/test_model_card.py
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+import pytest
+from datasets import Dataset, DatasetDict
+
+from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
+from sentence_transformers.model_card import generate_model_card
+
+
+@pytest.fixture(scope="session")
+def dummy_dataset():
+    """
+    Dummy dataset for testing purposes. The dataset looks as follows:
+    {
+        "anchor": ["anchor 1", "anchor 2", ..., "anchor 10"],
+        "positive": ["positive 1", "positive 2", ..., "positive 10"],
+        "negative": ["negative 1", "negative 2", ..., "negative 10"],
+    }
+    """
+    return Dataset.from_dict(
+        {
+            "anchor": [f"anchor {i}" for i in range(1, 11)],
+            "positive": [f"positive {i}" for i in range(1, 11)],
+            "negative": [f"negative {i}" for i in range(1, 11)],
+        }
+    )
+
+
+@pytest.mark.parametrize(
+    ("num_datasets", "expected_substrings"),
+    [
+        # 0 actually refers to just a single dataset
+        (
+            0,
+            [
+                "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors).",
+                "**Maximum Sequence Length:** 512 tokens",
+                "**Output Dimensionality:** 128 dimensions",
+                "**Similarity Function:** Cosine Similarity",
+                "#### Unnamed Dataset",
+                " | anchor 1 | positive 1 | negative 1 |",
+                "* Loss: [CoSENTLoss](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters:",
+            ],
+        ),
+        (
+            1,
+            [
+                "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0 dataset.",
+                "#### train_0",
+            ],
+        ),
+        (
+            2,
+            [
+                "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0 and train_1 datasets.",
+                "#### train_0",
+                "#### train_1",
+            ],
+        ),
+        (
+            10,
+            [
+                "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets.",
+                "<details><summary>train_0</summary>",  # We start using <details><summary> if we have more than 3 datasets
+                "#### train_0",
+                "</details>\n<details><summary>train_9</summary>",
+                "#### train_9",
+            ],
+        ),
+        # We start using "50 datasets" when the ", "-joined dataset names exceed 200 characters
+        (
+            50,
+            [
+                "This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on 50 datasets.",
+                "<details><summary>train_0</summary>",
+                "#### train_0",
+                "</details>\n<details><summary>train_49</summary>",
+                "#### train_49",
+            ],
+        ),
+    ],
+)
+def test_model_card_base(
+    stsb_bert_tiny_model: SentenceTransformer,
+    dummy_dataset: Dataset,
+    num_datasets: int,
+    expected_substrings: list[str],
+) -> None:
+    model = stsb_bert_tiny_model
+
+    train_dataset = dummy_dataset
+    if num_datasets:
+        train_dataset = DatasetDict({f"train_{i}": train_dataset for i in range(num_datasets)})
+
+    # This adds data to model.model_card_data
+    SentenceTransformerTrainer(
+        model,
+        train_dataset=train_dataset,
+    )
+
+    model_card = generate_model_card(model)
+
+    # For debugging purposes, we save the model card to a file
+    # with open(f"test_model_card_{num_datasets}.md", "w", encoding="utf8") as f:
+    #     f.write(model_card)
+
+    for substring in expected_substrings:
+        assert substring in model_card
+
+    # We don't want to have two consecutive empty lines anywhere
+    assert "\n\n\n" not in model_card
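
For reference, a minimal standalone sketch of the two thresholds the template change applies (the helper names here are hypothetical, not part of the patch or of the sentence-transformers API; the thresholds of 3 datasets and 200 characters are taken from the Jinja2 logic in the diff above):

```python
def intro_phrase(names: list[str]) -> str:
    """Mimic the intro sentence: fall back to a bare count for long name lists."""
    if len(", ".join(names)) > 200:
        return f"{len(names)} datasets"
    joined = names[0] if len(names) == 1 else ", ".join(names[:-1]) + " and " + names[-1]
    return f"the {joined} dataset{'s' if len(names) > 1 else ''}"


def dataset_section(name: str, total: int) -> str:
    """Mimic a per-dataset section: collapsible once there are more than 3 datasets."""
    if total > 3:
        return f"<details><summary>{name}</summary>\n\n#### {name}\n</details>"
    return f"#### {name}"


names = [f"train_{i}" for i in range(10)]
assert intro_phrase(names).endswith("train_8 and train_9 datasets")
assert dataset_section("train_0", total=len(names)).startswith("<details><summary>")

many = [f"train_{i}" for i in range(50)]
assert intro_phrase(many) == "50 datasets"  # joined names exceed 200 characters
```

Collapsing only past 3 datasets leaves short model cards unchanged, while the 200-character fallback bounds the length of the intro sentence for very large training mixes.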