diff --git a/pyproject.toml b/pyproject.toml
index 55348ee..3221ada 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,6 +4,6 @@ pythonpath = [
 ]
 
 [tool.mypy]
-files="transformer_ranker,tests"
+files="transformer_ranker"
 ignore_missing_imports = true
 check_untyped_defs = true
diff --git a/requirements.txt b/requirements.txt
index 6a57294..e47087d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ tokenizers
 torch
 torchmetrics
 tqdm
-transformers
\ No newline at end of file
+transformers
+scikit-learn
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d5bf07b..6f2f407 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ def read_requirements():
     name='transformer-ranker',
     version='0.1.0',
     packages=find_packages(),
-    description='Rank transformer models for NLP tasks using transferability measures',
+    description='Efficiently find the best-suited language model (LM) for your NLP task',
     long_description=open('README.md').read(),
     long_description_content_type="text/markdown",
     author='Lukas Garbas',
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..a0570bc
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,36 @@
+import pytest
+import torch
+from datasets import load_dataset
+from sklearn import datasets
+from transformers import AutoModel
+
+
+@pytest.fixture(scope="session")
+def small_language_models():
+    """Use two tiny models for testing"""
+    return (
+        AutoModel.from_pretrained("prajjwal1/bert-tiny"),
+        AutoModel.from_pretrained("google/electra-small-discriminator")
+    )
+
+
+@pytest.fixture(scope="session")
+def conll():
+    return load_dataset("conll2003")
+
+
+@pytest.fixture(scope="session")
+def trec():
+    return load_dataset("trec")
+
+
+@pytest.fixture(scope="session")
+def iris_dataset():
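+    """Iris features and targets as float tensors; row 142 is perturbed so no two rows are identical."""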
"sentence classification" - - # Step 2.1: Make sure that text column was found - assert preprocessor.text_column is not None - - # Step 2.2: Make sure that at least one label column was found - assert preprocessor.label_column is not None - - # Step 3: Check that prepare_sentences returns a non-empty list of sentences - sentences = preprocessor.prepare_sentences(dataset) - assert isinstance(sentences, list) and len(sentences) > 0, ( - "prepare_sentences returned an empty list for dataset %s" % dataset - ) - assert all(isinstance(sentence, str) for sentence in sentences), ( - "prepare_sentences returned non-string or non-list elements for dataset %s" % dataset - ) - - # Step 4: Check that prepare_labels returns a non-empty torch.Tensor - labels = preprocessor.prepare_labels(dataset) - assert isinstance(labels, torch.Tensor) and labels.size(0) > 0, ( - "prepare_labels returned an empty tensor for dataset %s" % dataset - ) - - -def test_word_datasets_datacleaner(): - dataset_names = word_classification_datasets - - for dataset in dataset_names: - preprocessor = DatasetCleaner(merge_data_splits=True) - dataset = preprocessor.prepare_dataset(dataset) - - # Ensure that the prepared dataset is a Dataset or DatasetDict - assert isinstance(dataset, (DatasetDict, Dataset)) - - # Step 1: Make sure that task type is correctly found - assert preprocessor.task_type == "word classification" - - # Step 2.1: Make sure that text column was found - assert preprocessor.text_column is not None - - # Step 2.2: Make sure that label column was found - assert preprocessor.label_column is not None - - # Step 3: Check that prepare_sentences returns a non-empty list of word tokens - sentences = preprocessor.prepare_sentences(dataset) - assert isinstance(sentences, list) and len(sentences) > 0, ( - "prepare_sentences returned an empty list for dataset %s" % dataset - ) - assert all(isinstance(sentence, list) for sentence in sentences), ( - "prepare_sentences returned non-string or non-list elements for dataset %s" % dataset - ) - - # Step 4: Check that prepare_labels returns a non-empty torch.Tensor - labels = preprocessor.prepare_labels(dataset) - assert isinstance(labels, torch.Tensor) and labels.size(0) > 0, ( - "prepare_labels returned an empty tensor for dataset %s" % dataset - ) - -# pytest test_datacleaner.py -v diff --git a/tests/datasets/test_labels.py b/tests/datasets/test_labels.py deleted file mode 100644 index 44bb748..0000000 --- a/tests/datasets/test_labels.py +++ /dev/null @@ -1,48 +0,0 @@ -from datasets import load_dataset -from transformer_ranker.datacleaner import DatasetCleaner -import torch - - -def test_sentence_labels(): - sentence_dataset = "trec" - num_labels = 5952 - first_label = torch.tensor(2) - - dataset = load_dataset(sentence_dataset) - - handler = DatasetCleaner() - dataset = handler.prepare_dataset(dataset) - labels = handler.prepare_labels(dataset) - - # Check if labels were converted to a tensor - assert isinstance(labels, torch.Tensor) - - # Check if number of labels match - assert torch.Size([num_labels]) == labels.shape - - # Check if the first label is as expected - print(labels[0]) - assert torch.equal(first_label, labels[0]) - - -def test_word_labels(): - word_dataset = "conll2003" - num_labels = 301418 # total number of tokens (words) in the dataset - first_label = torch.Tensor([3, 0, 7, 0, 0, 0, 7, 0, 0]) - - dataset = load_dataset(word_dataset ) - - handler = DatasetCleaner() - dataset = handler.prepare_dataset(dataset) - labels = handler.prepare_labels(dataset) - - # Check 
-    # Check if labels were converted to a tensor
-    assert isinstance(labels, torch.Tensor)
-
-    # Check if number of labels match
-    assert torch.Size([num_labels]) == labels.shape
-
-    # Check if the first subset of the label tensor matches
-    assert torch.equal(labels[:len(first_label)], first_label)
-
-# pytest test_labels.py -v
diff --git a/tests/datasets/test_sentences.py b/tests/datasets/test_sentences.py
deleted file mode 100644
index 8679a00..0000000
--- a/tests/datasets/test_sentences.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from datasets import load_dataset, Dataset
-from transformer_ranker.datacleaner import DatasetCleaner
-from tokenizers.pre_tokenizers import Whitespace
-
-
-def test_sentence_preprocessing():
-    sentence_dataset = "trec"
-    dataset_size = 5952
-    first_sentence = 'How did serfdom develop in and then leave Russia ?'
-    first_sentence_tokenized = ['How', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'Russia', '?']
-
-    dataset = load_dataset(sentence_dataset)
-    handler = DatasetCleaner()
-    preprocessed_dataset = handler.prepare_dataset(dataset)
-
-    # Check if the dataset has only the relevant columns (text and label)
-    assert isinstance(preprocessed_dataset, Dataset)
-    assert handler.text_column in preprocessed_dataset.column_names
-    assert handler.label_column in preprocessed_dataset.column_names
-    assert len(preprocessed_dataset.column_names) == 2
-
-    # Check if the size is same after preprocessing
-    assert len(preprocessed_dataset) == dataset_size
-
-    # Check if the first sentence in prepare sentences is still the same as original
-    sentences = handler.prepare_sentences(preprocessed_dataset)
-    assert sentences[0] == first_sentence
-
-    handler = DatasetCleaner(pre_tokenizer=Whitespace())
-    tokenized_dataset = handler.prepare_dataset(dataset)
-
-    # Check if the size is same after preprocessing and tokenizing
-    assert len(tokenized_dataset) == dataset_size
-
-    # Tokenization should affect sentence tasks
-    tokenized_sentences = handler.prepare_sentences(tokenized_dataset)
-    assert tokenized_sentences[0] == first_sentence_tokenized
-
-
-def test_word_datasets_datahandler():
-    word_dataset = "conll2003"
-    dataset_size = 20744
-    first_sentence = ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
-
-    dataset = load_dataset(word_dataset)
-    handler = DatasetCleaner()
-    preprocessed_dataset = handler.prepare_dataset(dataset)
-
-    # Check if the dataset has only the relevant columns (text and label)
-    assert isinstance(preprocessed_dataset, Dataset)
-    assert handler.text_column in preprocessed_dataset.column_names
-    assert handler.label_column in preprocessed_dataset.column_names
-    assert len(preprocessed_dataset.column_names) == 2
-
-    # Check if the size is same after preprocessing
-    assert len(preprocessed_dataset) == dataset_size
-
-    # Check if the first sentence in prepare sentences is still the same as original
-    sentences = handler.prepare_sentences(preprocessed_dataset)
-    assert sentences[0] == first_sentence
-
-    handler = DatasetCleaner(pre_tokenizer=Whitespace())
-    tokenized_dataset = handler.prepare_dataset(dataset)
-
-    # Check if the size is same after preprocessing and tokenizing
-    assert len(tokenized_dataset) == dataset_size
-
-    # Tokenization should not affect word classification tasks
-    tokenized_sentences = handler.prepare_sentences(tokenized_dataset)
-    assert tokenized_sentences[0] == first_sentence
-
-# pytest test_sentences.py -v
diff --git a/tests/estimators/conftest.py b/tests/estimators/conftest.py
deleted file mode 100644
index aa31203..0000000
--- a/tests/estimators/conftest.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import pytest
-
-import torch
-
-
-@pytest.fixture(scope="session")
-def iris_dataset():
-    from sklearn import datasets
-
-    iris = datasets.load_iris()
-
-
-    data = torch.tensor(iris["data"], dtype=torch.float32)
-    data[142] += torch.tensor([0, 0, 0, 0.01])  # make duplicate element unique
-
-    return {
-        "data": data,
-        "target": torch.tensor(iris["target"], dtype=torch.float32)
-    }
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
new file mode 100644
index 0000000..66a8b66
--- /dev/null
+++ b/tests/test_datasets.py
@@ -0,0 +1,87 @@
+from typing import List, Tuple, Type, Union
+
+import pytest
+import torch
+from datasets import Dataset
+from transformer_ranker.datacleaner import DatasetCleaner
+
+
+def load_datasets(dataset_type: str, num_datasets: Union[str, int] = "all") -> Tuple[List[str], str, Type]:
+    """Return dataset names, the expected task type, and the expected sentence type for a dataset category"""
+    dataset_map = {
+        'token': (
+            ["conll2003", "wnut_17"],
+            "token classification", list
+        ),
+        'text': (
+            ["trec", "stanfordnlp/sst2", "hate_speech18"],
+            "text classification", str
+        ),
+        'text_pair': (
+            ["yangwang825/sick", "SetFit/rte"],
+            "text classification", str
+        )
+    }
+
+    datasets, task_type, sentence_type = dataset_map[dataset_type]
+    if isinstance(num_datasets, int):
+        datasets = datasets[:num_datasets]
+
+    return datasets, task_type, sentence_type
+
+
+def validate_dataset(
+    preprocessor,
+    dataset_name: str,
+    dataset: Dataset,
+    expected_task_type: str,
+    sentence_type: Type
+):
+    assert isinstance(dataset, Dataset), f"Dataset '{dataset_name}' is not a valid Dataset object"
+
+    assert preprocessor.task_type == expected_task_type, \
+        (f"Task type mismatch: expected '{expected_task_type}', got '{preprocessor.task_type}' "
+         f"in dataset '{dataset_name}'")
+
+    # Make sure text and label columns were found
+    assert preprocessor.text_column is not None, f"Text column not found in dataset {dataset_name}"
+    assert preprocessor.label_column is not None, f"Label column not found in dataset {dataset_name}"
+
+    # Test texts in the text column
+    sentences = preprocessor.prepare_sentences(dataset)
+    assert isinstance(sentences, list) and len(sentences) > 0, (
+        f"Sentences/tokens list is empty in dataset {dataset_name}"
+    )
+    assert all(isinstance(sentence, sentence_type) for sentence in sentences), \
+        (f"Incorrect sentence/token type in dataset '{dataset_name}': all expected to be '{sentence_type}' "
+         f"but some sentences have a different type")
+
+    if sentence_type == str:
+        # For text and text-pair classification, make sure there are no empty strings
+        assert all(sentence != "" for sentence in sentences), f"Empty sentence found in dataset {dataset_name}"
+
+    if sentence_type == list:
+        # For token classification, make sure there are no empty token lists
+        assert all(sentence != [] for sentence in sentences), f"Empty token list found in dataset {dataset_name}"
+        # Check that no empty strings exist within the token lists
+        assert all(all(token != "" for token in sentence) for sentence in sentences), \
+            f"Empty token found within a sentence in dataset {dataset_name}"
+
+    # Test the label column in each dataset
+    labels = preprocessor.prepare_labels(dataset)
+    assert isinstance(labels, torch.Tensor) and labels.size(0) > 0, "Labels tensor is empty"
+    assert (labels >= 0).all(), f"Negative label found in dataset {dataset_name}"
+
+
+@pytest.mark.parametrize("dataset_type", ["text", "token", "text_pair"])
+def test_datacleaner(dataset_type):
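+    """Prepare every dataset of the given type and validate its texts, labels, and task type."""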
+    datasets, task_type, sentence_type = load_datasets(dataset_type, "all")
+
+    # Loop through all test datasets, downsampling each to 20% of its original size
+    for dataset_name in datasets:
+        preprocessor = DatasetCleaner(dataset_downsample=0.2)
+        dataset = preprocessor.prepare_dataset(dataset_name)
+
+        # Test dataset preprocessing
+        validate_dataset(preprocessor, dataset_name, dataset, task_type, sentence_type)
diff --git a/tests/test_embedder.py b/tests/test_embedder.py
new file mode 100644
index 0000000..8dd1a59
--- /dev/null
+++ b/tests/test_embedder.py
@@ -0,0 +1,52 @@
+import pytest
+from transformer_ranker import Embedder
+
+
+test_sentences = [
+    "this is a test sentence",
+    ["this", "is", "a", "test", "sentence"],
+    ["this is the first sentence.", "this is the second sentence."],
+    [["this", "is", "the", "first", "sentence", "."], ["this", "is", "the", "second", "sentence", "."]]
+]
+
+
+def test_embedder_inputs(small_language_models):
+    embeddings = {
+        'prajjwal1/bert-tiny': [],
+        'google/electra-small-discriminator': []
+    }
+
+    for model in small_language_models:
+        embedder = Embedder(model=model, layer_ids="all")
+        model_name = embedder.model_name
+
+        for sentence in test_sentences:
+            embedding = embedder.embed(sentence)
+            embeddings[model_name].append(embedding)
+
+    for model_name, sentence_embeddings in embeddings.items():
+        for embedding in sentence_embeddings:
+            assert embedding is not None and embedding != [], f"Empty or None embedding found for model {model_name}"
+
+
+def test_embedder_outputs(small_language_models):
+    for model in small_language_models:
+        embedder = Embedder(model=model, layer_ids="all")  # test word-level embedder
+        model_name = embedder.model_name
+        num_layers = embedder.num_transformer_layers
+        embedding = embedder.embed("this is a test sentence")[0]  # 5 words
+
+        # Embedding dim should be 5 words x num_layers x hidden_size
+        assert embedding.shape[:2] == (5, num_layers), \
+            f"Expected first two dimensions to be (5, {num_layers}), got {embedding.shape[:2]} using model {model_name}"
+
+    for model in small_language_models:
+        embedder = Embedder(model=model, layer_ids="all", sentence_pooling="mean")  # test sentence-level embedder
+        model_name = embedder.model_name
+        num_layers = embedder.num_transformer_layers
+        embedding = embedder.embed("this is a test sentence.")[0]
+
+        # Embedding dim should be num_layers x hidden_size
+        assert embedding.shape[0] == num_layers, \
+            (f"Expected a sentence embedding with dim (num_layers, hidden_size) "
+             f"but got {embedding.shape} using model {model_name}")
diff --git a/tests/test_ranker.py b/tests/test_ranker.py
new file mode 100644
index 0000000..92e73f5
--- /dev/null
+++ b/tests/test_ranker.py
@@ -0,0 +1,14 @@
+from transformer_ranker import TransformerRanker
+
+
+def test_ranker_trec(small_language_models, trec):
+    ranker = TransformerRanker(dataset=trec, dataset_downsample=0.2)
+    ranker.run(models=small_language_models, batch_size=64)
+
+
+def test_ranker_conll(small_language_models, conll):
+    ranker = TransformerRanker(dataset=conll, dataset_downsample=0.2)
+    ranker.run(models=small_language_models, batch_size=64)
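+
+# These are smoke tests: they pass as long as ranking runs end-to-end without raising.
+# pytest tests/test_ranker.py -v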