diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3bb039f..0a825a7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,7 +23,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8"]
+        python-version: ["3.9"]

     steps:
       - name: Checkout the repository
diff --git a/examples/code_examples/chunking.py b/examples/code_examples/chunking.py
index beaedf0..9eb8320 100644
--- a/examples/code_examples/chunking.py
+++ b/examples/code_examples/chunking.py
@@ -2,19 +2,18 @@
 from transformer_ranker import TransformerRanker

 # Load the 'conll2003' dataset
-dataset = load_dataset('conll2003')
+dataset = load_dataset("conll2003")

-# Use smaller models to test on CPU
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+# Use smaller models to run on CPU
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]

 # Initialize the ranker, set labels to chunk tags
-ranker = TransformerRanker(dataset=dataset,
-                           dataset_downsample=0.2,
-                           label_column='chunk_tags')
+ranker = TransformerRanker(dataset=dataset, dataset_downsample=0.2, label_column="chunk_tags")

 # ... and run it
 result = ranker.run(models=models, batch_size=64)
diff --git a/examples/code_examples/classification.py b/examples/code_examples/classification.py
index df37e74..1a4e214 100644
--- a/examples/code_examples/classification.py
+++ b/examples/code_examples/classification.py
@@ -2,21 +2,23 @@
 from transformer_ranker import TransformerRanker

 # Load and inspect the 'trec' dataset
-dataset = load_dataset('trec')
+dataset = load_dataset("trec")
 print(dataset)

 # Use smaller models to run on CPU
-language_models = ['prajjwal1/bert-tiny',
-                   'google/electra-small-discriminator',
-                   'microsoft/deberta-v3-small',
-                   'bert-base-uncased',
-                   ]
+language_models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]

 # Initialize the ranker
-ranker = TransformerRanker(dataset=dataset,
-                           dataset_downsample=0.2,
-                           label_column="coarse_label",
-                           )
+ranker = TransformerRanker(
+    dataset=dataset,
+    dataset_downsample=0.2,
+    label_column="coarse_label",
+)

 # ... and run it
 result = ranker.run(models=language_models, batch_size=32)
diff --git a/examples/code_examples/entailment.py b/examples/code_examples/entailment.py
index 6b49253..f4601a0 100644
--- a/examples/code_examples/entailment.py
+++ b/examples/code_examples/entailment.py
@@ -2,16 +2,17 @@
 from transformer_ranker import TransformerRanker

 # Load 'rte' Recognizing Textual Entailment dataset
-entailment_dataset = load_dataset('glue', 'rte')
+entailment_dataset = load_dataset("glue", "rte")

 # Use smaller models to run on CPU
-language_models = ['prajjwal1/bert-tiny',
-                   'google/electra-small-discriminator',
-                   'microsoft/deberta-v3-small',
-                   'bert-base-uncased',
-                   ]
+language_models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]

-# Initialize the ranker, set text_pair_column
+# Initialize the ranker, set column for text pairs
 ranker = TransformerRanker(dataset=entailment_dataset, text_pair_column="sentence2")

 # ... and run it
diff --git a/examples/code_examples/multiple_runs.py b/examples/code_examples/multiple_runs.py
index a0c1849..192347b 100644
--- a/examples/code_examples/multiple_runs.py
+++ b/examples/code_examples/multiple_runs.py
@@ -2,18 +2,18 @@
 from transformer_ranker import TransformerRanker

 # Load a dataset, initialize the ranker
-dataset = load_dataset('trec')
+dataset = load_dataset("trec")
 ranker = TransformerRanker(dataset=dataset, dataset_downsample=0.2)

 # Load smaller models
-models = ['prajjwal1/bert-tiny', 'google/electra-small-discriminator']
+models = ["prajjwal1/bert-tiny", "google/electra-small-discriminator"]

 # ... and rank them using a large batch size
 result = ranker.run(models=models, batch_size=124)
 print(result)

 # Add larger models
-models = ['bert-large-cased', 'google/electra-large-discriminator']
+models = ["bert-large-cased", "google/electra-large-discriminator"]

 # ... and rank them using a small batch size
 result.append(ranker.run(models=models, batch_size=16))
diff --git a/examples/code_examples/regression.py b/examples/code_examples/regression.py
index afafe2e..e0e4f05 100644
--- a/examples/code_examples/regression.py
+++ b/examples/code_examples/regression.py
@@ -2,14 +2,15 @@
 from transformer_ranker import TransformerRanker

 # Load a regression dataset
-regression_dataset = load_dataset('glue', 'stsb')
+regression_dataset = load_dataset("glue", "stsb")

-# You can test on cpu using smaller models
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+# Use smaller models to run on CPU
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]

 # Initialize the ranker, set the text pair column
 ranker = TransformerRanker(dataset=regression_dataset, text_pair_column="sentence2")
diff --git a/examples/code_examples/tagging.py b/examples/code_examples/tagging.py
index 00f7a3c..ac85ab3 100644
--- a/examples/code_examples/tagging.py
+++ b/examples/code_examples/tagging.py
@@ -2,19 +2,18 @@
 from transformer_ranker import TransformerRanker

 # Load the WNUT-17 NER dataset of English tweets
-dataset_ner = load_dataset('leondz/wnut_17')
+dataset_ner = load_dataset("leondz/wnut_17")

 # Use smaller models to test on CPU
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]

 # Initialize the ranker, set labels to ner tags
-ranker = TransformerRanker(dataset=dataset_ner,
-                           dataset_downsample=0.2,
-                           label_column='ner_tags')
+ranker = TransformerRanker(dataset=dataset_ner, dataset_downsample=0.2, label_column="ner_tags")

 # ... and run it
 result = ranker.run(models=models, batch_size=64)
diff --git a/setup.py b/setup.py
index f66d244..625d6a9 100644
--- a/setup.py
+++ b/setup.py
@@ -18,5 +18,5 @@ def read_requirements():
     url="https://github.com/flairNLP/transformer-ranker",
     install_requires=read_requirements(),
     license='MIT',
-    python_requires=">=3.8",
+    python_requires=">=3.9",
 )
diff --git a/transformer_ranker/datacleaner.py b/transformer_ranker/datacleaner.py
index 79a4065..baf06f6 100644
--- a/transformer_ranker/datacleaner.py
+++ b/transformer_ranker/datacleaner.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Optional, Type, Union

 import datasets
 import torch
@@ -22,7 +22,7 @@ def __init__(
         task_type: Optional[str] = None,
         text_column: Optional[str] = None,
         label_column: Optional[str] = None,
-        label_map: Optional[Dict[str, int]] = None,
+        label_map: Optional[dict[str, int]] = None,
         text_pair_column: Optional[str] = None,
     ):
         """
@@ -34,7 +34,7 @@
         :param change_bio_encoding: Convert BIO to single-class labels, removing B-, I-, O- prefix.
         :param remove_empty_sentences: Whether to remove empty sentences.
         :param dataset_downsample: Fraction to reduce the dataset size.
-        :param task_type: Task category "token classification", "text classification", "text regression".
+        :param task_type: "token classification", "text classification", or "text regression".
         :param text_column: Column name for texts.
         :param label_column: Column name for labels.
         :param label_map: A dictionary which maps label names to integers.
@@ -107,7 +107,7 @@
         )

         # Convert string labels to integers
-        if label_type == str:
+        if label_type is str:
             dataset, self.label_map = self._make_labels_categorical(dataset, label_column)

         # Try to find label map in the dataset
@@ -120,7 +120,10 @@
             dataset, label_column, self.label_map
         )

-        logger.info("Label map: %s", self.label_map)
+        # Keep only text and label columns
+        keep_columns = {text_column, self.text_pair_column, label_column} - {None}
+        columns_to_remove = list(set(dataset.column_names) - keep_columns)
+        dataset = dataset.remove_columns(columns_to_remove)

         # Set updated attributes and log them
         self.text_column = text_column
@@ -129,11 +132,6 @@
         self.dataset_size = len(dataset)
         self.log_dataset_info()

-        # Keep only text and label columns
-        keep_columns = {self.text_column, self.text_pair_column, self.label_column} - {None}
-        columns_to_remove = list(set(dataset.column_names) - keep_columns)
-        dataset = dataset.remove_columns(columns_to_remove)
-
         return dataset

     def prepare_labels(self, dataset: Dataset) -> torch.Tensor:
@@ -147,7 +145,7 @@
         )
         return torch.tensor(labels)

-    def prepare_sentences(self, dataset: Dataset) -> List[str]:
+    def prepare_sentences(self, dataset: Dataset) -> list[str]:
         """Gather sentences in the text column."""
         return dataset[self.text_column]

@@ -160,7 +158,7 @@ def _downsample(dataset: Dataset, ratio: float) -> Dataset:
     @staticmethod
     def _find_text_and_label_columns(
         dataset: Dataset, text_column: Optional[str] = None, label_column: Optional[str] = None
-    ) -> Tuple[str, str, Type]:
+    ) -> tuple[str, str, Type]:
         """Find text and label columns in hf datasets based on common keywords"""
         text_columns = [
             "text", "sentence", "token", "tweet", "document", "paragraph", "description",
@@ -196,7 +194,7 @@
     @staticmethod
     def _merge_textpairs(
         dataset: Dataset, text_column: str, text_pair_column: str
-    ) -> Tuple[Dataset, str]:
+    ) -> tuple[Dataset, str]:
         """Concatenate text pairs into a single text using separator token"""
         new_text_column_name = text_column + "+" + text_pair_column

@@ -206,7 +204,7 @@ def _merge_textpairs(
                 f"Use one of the following names for tex pair: {dataset.column_names}."
             )

-        def merge_texts(dataset_entry: Dict[str, str]) -> Dict[str, str]:
+        def merge_texts(dataset_entry: dict[str, str]) -> dict[str, str]:
             dataset_entry[text_column] = (
                 dataset_entry[text_column] + " [SEP] " + dataset_entry[text_pair_column]
             )
@@ -244,7 +242,7 @@ def pre_tokenize(example):
             example[text_column] = [token for token, _ in encoding]
             return example

-        dataset = dataset.map(pre_tokenize, num_proc=None, desc="Pre-tokenizing texts with Whitespace")
+        dataset = dataset.map(pre_tokenize, num_proc=None, desc="Whitespace pre-tokenization")
         return dataset

     @staticmethod
@@ -262,9 +260,10 @@ def is_valid_entry(sample) -> bool:
             # Check if text is non-empty and does not contain emoji variation character '\uFE0F'
             has_text = bool(text) and (not isinstance(text, list) or "\uFE0F" not in text)

-            # Check if label is non-null and all elements are non-negative
+            # Check for empty and negative labels
             valid_label = label is not None and (
-                all(l >= 0 for l in label) if isinstance(label, list) else label >= 0
+                all(word_label >= 0 for word_label in label) if isinstance(label, list)
+                else label >= 0
             )

             return has_text and valid_label
@@ -274,7 +273,7 @@
     @staticmethod
     def _make_labels_categorical(
         dataset: Dataset, label_column: str
-    ) -> Tuple[Dataset, Dict[str, int]]:
+    ) -> tuple[Dataset, dict[str, int]]:
         """Convert string labels to integers"""
         unique_labels = sorted(set(dataset[label_column]))
         label_map = {label: idx for idx, label in enumerate(unique_labels)}
@@ -287,7 +286,7 @@ def map_labels(dataset_entry):
         return dataset, label_map

     @staticmethod
-    def _create_label_map(dataset: Dataset, label_column: str) -> Dict[str, int]:
+    def _create_label_map(dataset: Dataset, label_column: str) -> dict[str, int]:
         """Try to find feature names in a hf dataset."""
         label_names = getattr(
             getattr(dataset.features[label_column], "feature", None), "names", None
         )
@@ -307,8 +306,8 @@
     @staticmethod
     def _change_bio_encoding(
-        dataset: Dataset, label_column: str, label_map: Dict[str, int]
-    ) -> Tuple[Dataset, Dict[str, int]]:
+        dataset: Dataset, label_column: str, label_map: dict[str, int]
+    ) -> tuple[Dataset, dict[str, int]]:
         """Remove BIO prefixes from NER labels, update the dataset, and create a new label map."""

         # Get unique labels without BIO prefixes and create new label map
@@ -330,15 +329,16 @@
         if label_map == new_label_map:
             logger.warning(
                 "Could not remove BIO prefixes for this tagging dataset. "
-                "Please add the label map as parameter label_map: Dict[str, int] = ... manually."
+                "Please add the label map as parameter label_map: dict[str, int] = ... manually."
             )

         return dataset, new_label_map

     def log_dataset_info(self) -> None:
         """Log information about dataset"""
-        logger.info("Texts and labels: '%s', '%s'", self.text_column, self.label_column)
-        logger.info("Task category: '%s'", self.task_type)
+        logger.info(f"Texts and labels: {self.text_column}, {self.label_column}")
+        logger.info(f"Label map: {self.label_map}")

         is_downsampled = self.dataset_downsample and self.dataset_downsample < 1.0
         downsample_info = f"(down-sampled to {self.dataset_downsample})" if is_downsampled else ""
-        logger.info("Dataset size: %s texts %s", self.dataset_size, downsample_info)
+        logger.info(f"Dataset size: {self.dataset_size} texts {downsample_info}")
+        logger.info(f"Task category: {self.task_type}")
diff --git a/transformer_ranker/embedder.py b/transformer_ranker/embedder.py
index 5db5fcd..bd224db 100644
--- a/transformer_ranker/embedder.py
+++ b/transformer_ranker/embedder.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union
+from typing import Optional, Union

 import torch
 from tokenizers.pre_tokenizers import Whitespace
@@ -17,7 +17,7 @@ def __init__(
         sentence_pooling: Optional[str] = None,
         use_pretokenizer: bool = True,
         local_files_only: bool = False,
-        device: Optional[str] = None,
+        device: Optional[Union[str, torch.device]] = None,
     ):
         """
         Embed texts using a pre-trained transformer model. It's a word-level embedder, where
@@ -71,11 +71,8 @@ def __init__(
         self.layer_pooling = layer_pooling
         self.sentence_pooling = sentence_pooling

-        # Set cpu or gpu device
-        if device is None:
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        else:
-            self.device = torch.device(device)
+        # Move model to device
+        self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
         self.model = self.model.to(self.device)

     def tokenize(self, sentences):
@@ -110,7 +107,7 @@ def embed(
         batch_size: int = 32,
         show_loading_bar: bool = True,
         move_embeddings_to_cpu: bool = True,
-    ) -> List[torch.Tensor]:
+    ) -> list[torch.Tensor]:
         """Split sentences into batches and embedd the full dataset"""
         if not isinstance(sentences, list):
             sentences = [sentences]
@@ -129,10 +126,9 @@ def embed(

         return embeddings

-    def embed_batch(self, sentences, move_embeddings_to_cpu: bool = True) -> List[torch.Tensor]:
-        """Embeds a batch of sentences and returns a list of sentence embeddings that
-        consist of different numbers of word-level embeddings"""
-        # Tokenize with auto tokenizer
+    def embed_batch(self, sentences, move_embeddings_to_cpu: bool = True) -> list[torch.Tensor]:
+        """Embeds a batch of sentences and returns a list of sentence embeddings
+        (lists of word embeddings). Embeddings can be moved to CPU or kept on GPU"""
         tokenized_input = self.tokenize(sentences)

         # Move inputs to gpu
@@ -180,24 +176,23 @@ def embed_batch(self, sentences, move_embeddings_to_cpu: bool = True) -> List[to

         return sentence_embeddings

-    def _filter_layer_ids(self, layer_ids) -> List[int]:
-        """Transform a string with layer ids into a list of ints and
-        remove ids that are out of bound of the actual transformer size"""
+    def _filter_layer_ids(self, layer_ids: str) -> list[int]:
+        """Transform a string with layer ids into a list of ints.
+        Check that all ids are within bounds of the model size"""
+        num_layers = self.num_transformer_layers
         if layer_ids == "all":
-            return [-i for i in range(1, self.num_transformer_layers + 1)]
-
-        layer_ids = [int(number) for number in layer_ids.split(",")]
-        layer_ids = [
-            layer_id for layer_id in layer_ids if self.num_transformer_layers >= abs(layer_id)
-        ]
+            new_layer_ids = [-i for i in range(1, num_layers + 1)]
+        else:
+            new_layer_ids = [int(number) for number in layer_ids.split(",")]
+            new_layer_ids = [layer_id for layer_id in new_layer_ids if abs(layer_id) <= num_layers]

-        if not layer_ids:
+        if not new_layer_ids:
             raise ValueError(
-                f'"layer_ids" are out of bounds for the model size. '
-                f"Num layers in model {self.model_name}: {self.num_transformer_layers}"
+                "Given layer_ids are out of bounds for the model size. "
+                f"Num layers in model {self.model_name}: {num_layers}"
             )

-        return layer_ids
+        return new_layer_ids

     def _extract_relevant_layers(self, batched_embeddings: torch.Tensor) -> torch.Tensor:
         """Keep only relevant layers in each embedding and apply layer-wise pooling if required"""
@@ -216,27 +211,26 @@ def _extract_relevant_layers(self, batched_embeddings: torch.Te

         return batched_embeddings

-    def _pool_subwords(self, sentence_embedding, sentence_word_ids) -> List[torch.Tensor]:
+    def _pool_subwords(self, sentence_embedding, sentence_word_ids) -> list[torch.Tensor]:
         """Pool sub-word embeddings into word embeddings for a single sentence.
         Subword pooling methods: 'first', 'last', 'mean'"""
-        word_embeddings: List[torch.Tensor] = []
-        subword_embeddings: List[torch.Tensor] = []
+        word_embeddings: list[torch.Tensor] = []
+        subword_embeddings: list[torch.Tensor] = []
         previous_word_id: int = 0

-        # Gather word-level embeddings as lists of subwords
+        # Gather word-level embeddings as lists of sub-words
         for token_embedding, word_id in zip(sentence_embedding, sentence_word_ids):
-            # Append a word (stack all subwords into a word tensor)
+            # Stack all sub-words into a word tensor
             if previous_word_id != word_id and subword_embeddings:
                 word_embeddings.append(torch.stack(subword_embeddings, dim=0))
                 subword_embeddings = []

-            # Gather subword tokens into a single word
             if word_id is not None:
                 subword_embeddings.append(token_embedding)
                 previous_word_id = word_id

-        # Append the last word (some tokenizers don't have 'end of sequence' token)
+        # Add last word as some tokenizers don't have 'end of sequence' token
         if subword_embeddings:
             word_embeddings.append(torch.stack(subword_embeddings, dim=0))
@@ -270,7 +264,7 @@ def _pool_words(self, word_embeddings: torch.Tensor) -> torch.Tensor:
         if self.sentence_pooling == "last":
             sentence_embedding = word_embeddings[-1]

-        # Weight words by last word having the lowest importance: slightly better option for Causal LMs
+        # Weight words by last word having lower importance: can be better for Causal LMs
         if self.sentence_pooling == "weighted_mean":
             weights = torch.linspace(0.9, 0.1, steps=len(word_embeddings))
             weights /= weights.sum()
diff --git a/transformer_ranker/estimators/hscore.py b/transformer_ranker/estimators/hscore.py
index 580fd5f..5ec3ccf 100644
--- a/transformer_ranker/estimators/hscore.py
+++ b/transformer_ranker/estimators/hscore.py
@@ -12,9 +12,9 @@ def __init__(self):

     def fit(self, embeddings: torch.Tensor, labels: torch.Tensor) -> float:
         """
-        H-score intuition: Higher variance between embeddings of different classes (mean vectors for each class)
-        and lower feature redundancy (i.e. inverse of the covariance matrix for all data points)
-        lead to better transferability.
+        H-score intuition: Higher variance between embeddings of different classes
+        (mean vectors for each class) and lower feature redundancy (i.e. inverse of the covariance
+        matrix for all data points) lead to better transferability.

         :param embeddings: Embedding matrix of shape (num_samples, hidden_size)
         :param labels: Label vector of shape (num_samples,)
@@ -33,11 +33,11 @@ def fit(self, embeddings: torch.Tensor, labels: torch.Tensor) -> float:
         cov_matrix = torch.mm(embeddings.T, embeddings) / num_samples

         # Compute beta and delta for the Ledoit-Wolf shrinkage
-        squared_features = embeddings ** 2
+        squared_features = embeddings**2
         emp_cov_trace = torch.sum(squared_features, dim=0) / num_samples
         mean_cov = torch.sum(emp_cov_trace) / hidden_size
         beta_ = torch.sum(torch.mm(squared_features.T, squared_features)) / num_samples
-        delta_ = torch.sum(cov_matrix ** 2)
+        delta_ = torch.sum(cov_matrix**2)
         beta = (beta_ - delta_) / (hidden_size * num_samples)
         delta = delta_ - 2.0 * mean_cov * emp_cov_trace.sum() + hidden_size * mean_cov**2
         delta /= hidden_size
@@ -51,7 +51,9 @@ def fit(self, embeddings: torch.Tensor, labels: torch.Tensor) -> float:
         pinv_covf_alpha = torch.linalg.pinv(covf_alpha, rcond=1e-15)

         # Matrix of class-conditioned means
-        class_means = torch.zeros(num_classes, hidden_size, dtype=torch.float64, device=embeddings.device)
+        class_means = torch.zeros(
+            num_classes, hidden_size, dtype=torch.float64, device=embeddings.device
+        )
         for i, cls in enumerate(classes):
             mask = labels == cls
             class_features = embeddings[mask].mean(dim=0)
diff --git a/transformer_ranker/estimators/logme.py b/transformer_ranker/estimators/logme.py
index 06c86d7..e0de6c5 100644
--- a/transformer_ranker/estimators/logme.py
+++ b/transformer_ranker/estimators/logme.py
@@ -20,22 +20,27 @@ def fit(
         initial_alpha: float = 1.0,
         initial_beta: float = 1.0,
         max_iter: int = 11,
-        tol: float = 1e-3
+        tol: float = 1e-3,
     ) -> float:
         """
-        LogME intuition: estimate the evidence for embeddings by iteratively optimizing the prior (alpha) and
-        likelihood (beta), projecting the target labels onto the singular vectors of the feature matrix.
+        LogME intuition: estimate the evidence for embeddings by iteratively optimizing
+        the prior (alpha) and likelihood (beta), projecting the target labels onto the singular
+        vectors of the feature matrix.

         :param embeddings: Embedding matrix of shape (num_samples, hidden_dim)
         :param labels: Label vector of shape (num_samples,)
-        :param alpha: Initial precision of the prior (controls the regularization strength)
-        :param beta: Initial precision of the likelihood (controls the noise in the data)
+        :param initial_alpha: Initial precision of the prior (controls the regularization strength)
+        :param initial_beta: Initial precision of the likelihood (controls the noise in the data)
         :param tol: Tolerance for the optimization convergence
         :param max_iter: Maximum iterations to optimize alpha and beta
         :return: LogME score, where higher is better
         """
         embeddings = embeddings.to(torch.float64)
-        labels = labels.to(torch.float64).unsqueeze(-1) if self.regression and labels.dim() == 1 else labels
+        labels = (
+            labels.to(torch.float64).unsqueeze(-1)
+            if self.regression and labels.dim() == 1
+            else labels
+        )

         # Get the number of samples, number of classes, and the hidden size
         num_samples, hidden_size = embeddings.shape
@@ -46,7 +51,7 @@ def fit(
         u, singular_values, v_transpose = torch.linalg.svd(embeddings, full_matrices=False)

         # Compute sigma which is the square of singular values
-        sigma = (singular_values.reshape(-1, 1) ** 2)
+        sigma = singular_values.reshape(-1, 1) ** 2

         evidence_sum = 0.0

@@ -55,26 +60,32 @@ def fit(

         # Loop over each class (for classification) or each target column (for regression)
         for i in range(num_classes):
-            # For classification create a one-hot vector, for regression, use the corresponding column of labels
-            labels_ = labels[:, i] if self.regression else (labels == class_names[i]).to(torch.float64)
+            # Use one-hot vectors for classification and label columns for regression
+            labels_ = (
+                labels[:, i] if self.regression else (labels == class_names[i]).to(torch.float64)
+            )
             labels_ = labels_.unsqueeze(-1)

             # Project labels onto the singular vectors (x)
             projected_labels = u.T @ labels_
-            projected_labels_squared = projected_labels ** 2
+            projected_labels_squared = projected_labels**2

-            # Compute residual sum of squares. If k < hidden_size, we compute sum of xi for 0 singular values directly
-            residual_sum_squares = (labels_ ** 2).sum() - projected_labels_squared.sum()
+            # Residual sum of squares; if k < hidden_size, sum xi for zero singular values
+            residual_sum_squares = (labels_**2).sum() - projected_labels_squared.sum()

             residual_error = torch.tensor(0.0)
             precision_weighted_sum = torch.tensor(0.0)

             # Iteratively update alpha and beta until convergence or maximum iterations
             for _ in range(max_iter):
-                tau = alpha / beta  # Ratio of alpha to beta, representing the noise-to-signal ratio
+                tau = alpha / beta  # Alpha-to-beta ratio, representing noise-to-signal
                 gamma = (sigma / (sigma + tau)).sum()
-                precision_weighted_sum = (sigma * projected_labels_squared / ((tau + sigma) ** 2)).sum()
-                residual_error = (projected_labels_squared / ((1 + sigma / tau) ** 2)).sum() + residual_sum_squares
+                precision_weighted_sum = (
+                    sigma * projected_labels_squared / ((tau + sigma) ** 2)
+                ).sum()
+                residual_error = (
+                    projected_labels_squared / ((1 + sigma / tau) ** 2)
+                ).sum() + residual_sum_squares

                 # Update alpha (prior precision) and beta (likelihood precision)
                 alpha = gamma / (precision_weighted_sum + 1e-5)
@@ -86,12 +97,14 @@ def fit(
                     break

             # Compute evidence using optimized alpha and beta
-            evidence = (hidden_size / 2.0 * torch.log(alpha)
-                        + num_samples / 2.0 * torch.log(beta)
-                        - 0.5 * torch.sum(torch.log(alpha + beta * sigma))
-                        - beta / 2.0 * residual_error
-                        - alpha / 2.0 * precision_weighted_sum
-                        - num_samples / 2.0 * torch.log(torch.tensor(2 * torch.pi)))
+            evidence = (
+                hidden_size / 2.0 * torch.log(alpha)
+                + num_samples / 2.0 * torch.log(beta)
+                - 0.5 * torch.sum(torch.log(alpha + beta * sigma))
+                - beta / 2.0 * residual_error
+                - alpha / 2.0 * precision_weighted_sum
+                - num_samples / 2.0 * torch.log(torch.tensor(2 * torch.pi))
+            )
             evidence /= num_samples

             # Sum the evidence for each class
diff --git a/transformer_ranker/estimators/nearestneighbors.py b/transformer_ranker/estimators/nearestneighbors.py
index 31f4838..3eda18c 100644
--- a/transformer_ranker/estimators/nearestneighbors.py
+++ b/transformer_ranker/estimators/nearestneighbors.py
@@ -41,7 +41,7 @@ def fit(self, embeddings: torch.Tensor, labels: torch.Tensor, batch_size: int =

             # Exclude self-distances by setting diagonal to a large number
             diag_indices = torch.arange(start, end, device=embeddings.device)
-            dists[diag_indices - start, diag_indices] = float('inf')
+            dists[diag_indices - start, diag_indices] = float("inf")

             # Indices of the k nearest neighbors for the batch
             batch_knn_indices = dists.topk(self.k, largest=False).indices
diff --git a/transformer_ranker/ranker.py b/transformer_ranker/ranker.py
index 584d247..fc672fd 100644
--- a/transformer_ranker/ranker.py
+++ b/transformer_ranker/ranker.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional, Union
+from typing import Optional, Union

 import torch
 from datasets.dataset_dict import Dataset, DatasetDict
@@ -23,33 +23,28 @@ def __init__(
         **kwargs,
     ):
         """
-        Rank language models for different NLP tasks. Embed a part of the dataset and
-        estimate embedding suitability with transferability metrics like hscore or logme.
-        Embeddings can be averaged across all layers or selected from the best-performing layer.
+        Rank language models for various NLP tasks. Extract embeddings and evaluate
+        their suitability for a dataset using metrics like hscore or logme.
+        Embeddings can be averaged across all layers or selected from the best-suited layer.

         :param dataset: a dataset from huggingface, containing texts and label columns.
         :param dataset_downsample: a fraction to which the dataset should be reduced.
         :param kwargs: Additional dataset-specific parameters for data cleaning.
         """
-        # Clean the original dataset and keep only needed columns
-        self.data_handler = DatasetCleaner(
+        self.data_cleaner = DatasetCleaner(
             dataset_downsample=dataset_downsample,
             text_column=text_column,
             label_column=label_column,
             **kwargs,
         )

-        self.dataset = self.data_handler.prepare_dataset(dataset)
-
-        self.task_type = self.data_handler.task_type
-
-        # Find text and label columns
-        self.text_column = self.data_handler.text_column
-        self.label_column = self.data_handler.label_column
+        # Prepare dataset, identify task category
+        self.dataset = self.data_cleaner.prepare_dataset(dataset)
+        self.task_type = self.data_cleaner.task_type

     def run(
         self,
-        models: List[Union[str, torch.nn.Module]],
+        models: list[Union[str, torch.nn.Module]],
         batch_size: int = 32,
         estimator: str = "hscore",
         layer_aggregator: str = "layermean",
@@ -59,25 +54,30 @@ def run(
         **kwargs,
     ):
         """
-        Load models, get embeddings, score, and rank results.
+        Load models, get embeddings, score them, and rank results.

         :param models: A list of model names string identifiers
         :param batch_size: The number of samples to process in each batch, defaults to 32.
-        :param estimator: Transferability metric (e.g., 'hscore', 'logme', 'knn').
-        :param layer_aggregator: Which layer to select (e.g., 'layermean', 'bestlayer').
-        :param sentence_pooling: Embedder parameter for pooling words into a sentence embedding for
-        text classification tasks. Defaults to "mean" to average of all words.
-        :param device: Device for embedding, defaults to GPU if available ('cpu', 'cuda', 'cuda:2').
-        :param gpu_estimation: Store and score embeddings on GPU for speedup.
+        :param estimator: Transferability metric: 'hscore', 'logme', or 'knn'.
+        :param layer_aggregator: Which layers to use: 'layermean' or 'bestlayer'.
+        :param sentence_pooling: Pool words into a sentence embedding for text classification.
+        :param device: Device for language models ('cpu', 'cuda', 'cuda:2').
+        :param gpu_estimation: Whether to score embeddings on the same device (defaults to True).
+        :param kwargs: Additional parameters for the embedder class (e.g. subword pooling)
         :return: Returns the sorted dictionary of model names and their scores
         """
         self._confirm_ranker_setup(estimator=estimator, layer_aggregator=layer_aggregator)

+        # Set device for language model embeddings and log it
+        device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
+        logger.info(f"Running on {device}")
+
         # Load all transformers into hf cache
         self._preload_transformers(models, device)

-        labels = self.data_handler.prepare_labels(self.dataset)
+        # Prepare texts and labels from the dataset
+        texts = self.data_cleaner.prepare_sentences(self.dataset)
+        labels = self.data_cleaner.prepare_labels(self.dataset)

         ranking_results = Result(metric=estimator)
@@ -102,7 +102,7 @@ def run(
             )

             embeddings = embedder.embed(
-                self.data_handler.prepare_sentences(self.dataset),
+                sentences=texts,
                 batch_size=batch_size,
                 show_loading_bar=True,
                 move_embeddings_to_cpu=not gpu_estimation,
@@ -150,7 +150,7 @@ def run(
                 zip(embedded_layer_ids, layer_scores)
             )

-            # Aggregate layer scores
+            # Layer average gives one score, bestlayer uses max of scores
             final_score = max(layer_scores) if layer_aggregator == "bestlayer" else layer_scores[0]
             ranking_results.add_score(model_name, final_score)
@@ -158,7 +158,7 @@ def run(
             result_log = f"{model_name} estimation: {final_score} ({ranking_results.metric})"

             if layer_aggregator == "bestlayer":
-                result_log += f", layerwise scores: {ranking_results.layerwise_scores[model_name]}"
+                result_log += f", layer scores: {ranking_results.layerwise_scores[model_name]}"

             logger.info(result_log)
@@ -166,7 +166,8 @@

     @staticmethod
     def _preload_transformers(
-        models: List[Union[str, torch.nn.Module]], device: Optional[str] = None
+        models: list[Union[str, torch.nn.Module]],
+        device: torch.device,
     ) -> None:
         """Loads all models into HuggingFace cache"""
         cached_models, download_models = [], []
diff --git a/transformer_ranker/utils.py b/transformer_ranker/utils.py
index 66b51ee..e0e7851 100644
--- a/transformer_ranker/utils.py
+++ b/transformer_ranker/utils.py
@@ -1,12 +1,11 @@
 import logging
 import operator
 import warnings
-from typing import Dict, List

 from transformers import logging as transformers_logging


-def prepare_popular_models(model_size="base") -> List[str]:
+def prepare_popular_models(model_size="base") -> list[str]:
     """Two lists of language models to try out"""
     base_models = [
         # English models
@@ -100,11 +99,11 @@ def __init__(self, metric: str):
         param metric: metric name (e.g. "hscore", or "logme")
         """
         self.metric = metric
-        self._results: Dict[str, float] = {}
-        self.layerwise_scores: Dict[str, Dict[int, float]] = {}
+        self._results: dict[str, float] = {}
+        self.layerwise_scores: dict[str, dict[int, float]] = {}

     @property
-    def results(self) -> Dict[str, float]:
+    def results(self) -> dict[str, float]:
         """Return the result dictionary sorted by scores in descending order"""
         return dict(sorted(self._results.items(), key=lambda x: x[1], reverse=True))

@@ -115,12 +114,12 @@ def best_model(
         return model_name

     @property
-    def top_three(self) -> Dict[str, float]:
+    def top_three(self) -> dict[str, float]:
         """Return three highest scoring models"""
         return {k: self.results[k] for k in list(self.results.keys())[: min(3, len(self.results))]}

     @property
-    def best_layers(self) -> Dict[str, int]:
+    def best_layers(self) -> dict[str, int]:
         """Return a dictionary mapping each model name to its best layer ID."""
         best_layers_dict = {}
         for model, values in self.layerwise_scores.items():
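
For a quick smoke test of the refactored API, here is a minimal end-to-end sketch assembled from the example scripts and the Result properties in this diff. The dataset name, model identifiers, label_column, the default hscore estimator, and the best_model/top_three accessors all appear above; the load_dataset import from the datasets package is an assumption (it is cut off at the top of the example files), and the printed values are run-dependent.

from datasets import load_dataset
from transformer_ranker import TransformerRanker

# Small models that run on CPU (same list as examples/code_examples/classification.py)
models = ["prajjwal1/bert-tiny", "google/electra-small-discriminator"]

# Down-sample 'trec' to 20% and rank the models with the default estimator (hscore)
dataset = load_dataset("trec")
ranker = TransformerRanker(dataset=dataset, dataset_downsample=0.2, label_column="coarse_label")
result = ranker.run(models=models, batch_size=32)

# Result accessors defined in transformer_ranker/utils.py
print(result.best_model)  # highest-scoring model name
print(result.top_three)   # three best models with their scores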