Improve logging #6

Merged
merged 3 commits on Nov 14, 2024

2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -23,7 +23,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8"]
+        python-version: ["3.9"]
 
     steps:
       - name: Checkout the repository
19 changes: 9 additions & 10 deletions examples/code_examples/chunking.py
@@ -2,19 +2,18 @@
 from transformer_ranker import TransformerRanker
 
 # Load the 'conll2003' dataset
-dataset = load_dataset('conll2003')
+dataset = load_dataset("conll2003")
 
-# Use smaller models to test on CPU
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+# Use smaller models to run on CPU
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
 # Initialize the ranker, set labels to chunk tags
-ranker = TransformerRanker(dataset=dataset,
-                           dataset_downsample=0.2,
-                           label_column='chunk_tags')
+ranker = TransformerRanker(dataset=dataset, dataset_downsample=0.2, label_column="chunk_tags")
 
 # ... and run it
 result = ranker.run(models=models, batch_size=64)
22 changes: 12 additions & 10 deletions examples/code_examples/classification.py
@@ -2,21 +2,23 @@
 from transformer_ranker import TransformerRanker
 
 # Load and inspect the 'trec' dataset
-dataset = load_dataset('trec')
+dataset = load_dataset("trec")
 print(dataset)
 
 # Use smaller models to run on CPU
-language_models = ['prajjwal1/bert-tiny',
-                   'google/electra-small-discriminator',
-                   'microsoft/deberta-v3-small',
-                   'bert-base-uncased',
-                   ]
+language_models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
 # Initialize the ranker
-ranker = TransformerRanker(dataset=dataset,
-                           dataset_downsample=0.2,
-                           label_column="coarse_label",
-                           )
+ranker = TransformerRanker(
+    dataset=dataset,
+    dataset_downsample=0.2,
+    label_column="coarse_label",
+)
 
 # ... and run it
 result = ranker.run(models=language_models, batch_size=32)
15 changes: 8 additions & 7 deletions examples/code_examples/entailment.py
@@ -2,16 +2,17 @@
 from transformer_ranker import TransformerRanker
 
 # Load 'rte' Recognizing Textual Entailment dataset
-entailment_dataset = load_dataset('glue', 'rte')
+entailment_dataset = load_dataset("glue", "rte")
 
 # Use smaller models to run on CPU
-language_models = ['prajjwal1/bert-tiny',
-                   'google/electra-small-discriminator',
-                   'microsoft/deberta-v3-small',
-                   'bert-base-uncased',
-                   ]
+language_models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
-# Initialize the ranker, set text_pair_column
+# Initialize the ranker, set column for text pairs
 ranker = TransformerRanker(dataset=entailment_dataset, text_pair_column="sentence2")
 
 # ... and run it
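Note for readers: `text_pair_column` makes the data cleaner concatenate both sentences into a single text column with a separator token (see the `_merge_textpairs` hunk in datacleaner.py further down). A rough sketch of the effect, with made-up sentences:

```python
# Illustrative only: mirrors what merge_texts() in datacleaner.py does per entry
premise = "The cat sat on the mat."
hypothesis = "An animal is on the mat."
merged = premise + " [SEP] " + hypothesis
print(merged)  # The cat sat on the mat. [SEP] An animal is on the mat.
```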
6 changes: 3 additions & 3 deletions examples/code_examples/multiple_runs.py
@@ -2,18 +2,18 @@
 from transformer_ranker import TransformerRanker
 
 # Load a dataset, initialize the ranker
-dataset = load_dataset('trec')
+dataset = load_dataset("trec")
 ranker = TransformerRanker(dataset=dataset, dataset_downsample=0.2)
 
 # Load smaller models
-models = ['prajjwal1/bert-tiny', 'google/electra-small-discriminator']
+models = ["prajjwal1/bert-tiny", "google/electra-small-discriminator"]
 
 # ... and rank them using a large batch size
 result = ranker.run(models=models, batch_size=124)
 print(result)
 
 # Add larger models
-models = ['bert-large-cased', 'google/electra-large-discriminator']
+models = ["bert-large-cased", "google/electra-large-discriminator"]
 
 # ... and rank them using a small batch size
 result.append(ranker.run(models=models, batch_size=16))
15 changes: 8 additions & 7 deletions examples/code_examples/regression.py
@@ -2,14 +2,15 @@
 from transformer_ranker import TransformerRanker
 
 # Load a regression dataset
-regression_dataset = load_dataset('glue', 'stsb')
+regression_dataset = load_dataset("glue", "stsb")
 
-# You can test on cpu using smaller models
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+# Use smaller models to run on CPU
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
 # Initialize the ranker, set the text pair column
 ranker = TransformerRanker(dataset=regression_dataset, text_pair_column="sentence2")
17 changes: 8 additions & 9 deletions examples/code_examples/tagging.py
@@ -2,19 +2,18 @@
 from transformer_ranker import TransformerRanker
 
 # Load the WNUT-17 NER dataset of English tweets
-dataset_ner = load_dataset('leondz/wnut_17')
+dataset_ner = load_dataset("leondz/wnut_17")
 
 # Use smaller models to test on CPU
-models = ['prajjwal1/bert-tiny',
-          'google/electra-small-discriminator',
-          'microsoft/deberta-v3-small',
-          'bert-base-uncased',
-          ]
+models = [
+    "prajjwal1/bert-tiny",
+    "google/electra-small-discriminator",
+    "microsoft/deberta-v3-small",
+    "bert-base-uncased",
+]
 
 # Initialize the ranker, set labels to ner tags
-ranker = TransformerRanker(dataset=dataset_ner,
-                           dataset_downsample=0.2,
-                           label_column='ner_tags')
+ranker = TransformerRanker(dataset=dataset_ner, dataset_downsample=0.2, label_column="ner_tags")
 
 # ... and run it
 result = ranker.run(models=models, batch_size=64)
2 changes: 1 addition & 1 deletion setup.py
@@ -18,5 +18,5 @@ def read_requirements():
     url="https://github.com/flairNLP/transformer-ranker",
     install_requires=read_requirements(),
     license='MIT',
-    python_requires=">=3.8",
+    python_requires=">=3.9",
 )
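The version bump lines up with the typing changes in datacleaner.py below: the PR drops `typing.Dict`, `typing.List`, and `typing.Tuple` in favor of PEP 585 built-in generics, which are only subscriptable at runtime from Python 3.9 onward. A quick illustration (not part of the diff):

```python
# On Python 3.8 this raises at definition time, since the return annotation
# is evaluated when the function is defined:
#   TypeError: 'type' object is not subscriptable
# On Python 3.9+ (PEP 585), list[str] works without importing typing.List.
def prepare(texts) -> list[str]:
    return [str(t) for t in texts]
```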
50 changes: 25 additions & 25 deletions transformer_ranker/datacleaner.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Optional, Type, Union
 
 import datasets
 import torch
@@ -22,7 +22,7 @@ def __init__(
         task_type: Optional[str] = None,
         text_column: Optional[str] = None,
         label_column: Optional[str] = None,
-        label_map: Optional[Dict[str, int]] = None,
+        label_map: Optional[dict[str, int]] = None,
         text_pair_column: Optional[str] = None,
     ):
         """
@@ -34,7 +34,7 @@ def __init__(
         :param change_bio_encoding: Convert BIO to single-class labels, removing B-, I-, O- prefix.
         :param remove_empty_sentences: Whether to remove empty sentences.
         :param dataset_downsample: Fraction to reduce the dataset size.
-        :param task_type: Task category "token classification", "text classification", "text regression".
+        :param task_type: "token classification", "text classification", or "text regression".
         :param text_column: Column name for texts.
         :param label_column: Column name for labels.
         :param label_map: A dictionary which maps label names to integers.
@@ -107,7 +107,7 @@ def prepare_dataset(
         )
 
         # Convert string labels to integers
-        if label_type == str:
+        if isinstance(label_type, str):
             dataset, self.label_map = self._make_labels_categorical(dataset, label_column)
 
         # Try to find label map in the dataset
@@ -120,7 +120,10 @@
             dataset, label_column, self.label_map
         )
 
-        logger.info("Label map: %s", self.label_map)
+        # Keep only text and label columns
+        keep_columns = {text_column, self.text_pair_column, label_column} - {None}
+        columns_to_remove = list(set(dataset.column_names) - keep_columns)
+        dataset = dataset.remove_columns(columns_to_remove)
 
         # Set updated attributes and log them
         self.text_column = text_column
@@ -129,11 +132,6 @@
         self.dataset_size = len(dataset)
         self.log_dataset_info()
 
-        # Keep only text and label columns
-        keep_columns = {self.text_column, self.text_pair_column, self.label_column} - {None}
-        columns_to_remove = list(set(dataset.column_names) - keep_columns)
-        dataset = dataset.remove_columns(columns_to_remove)
-
         return dataset
 
     def prepare_labels(self, dataset: Dataset) -> torch.Tensor:
@@ -147,7 +145,7 @@ def prepare_labels(self, dataset: Dataset) -> torch.Tensor:
         )
         return torch.tensor(labels)
 
-    def prepare_sentences(self, dataset: Dataset) -> List[str]:
+    def prepare_sentences(self, dataset: Dataset) -> list[str]:
         """Gather sentences in the text column."""
         return dataset[self.text_column]
 
@@ -160,7 +158,7 @@ def _downsample(dataset: Dataset, ratio: float) -> Dataset:
     @staticmethod
     def _find_text_and_label_columns(
         dataset: Dataset, text_column: Optional[str] = None, label_column: Optional[str] = None
-    ) -> Tuple[str, str, Type]:
+    ) -> tuple[str, str, Type]:
         """Find text and label columns in hf datasets based on common keywords"""
         text_columns = [
             "text", "sentence", "token", "tweet", "document", "paragraph", "description",
@@ -196,7 +194,7 @@ def _find_text_and_label_columns(
     @staticmethod
     def _merge_textpairs(
         dataset: Dataset, text_column: str, text_pair_column: str
-    ) -> Tuple[Dataset, str]:
+    ) -> tuple[Dataset, str]:
         """Concatenate text pairs into a single text using separator token"""
         new_text_column_name = text_column + "+" + text_pair_column
 
@@ -206,7 +204,7 @@ def _merge_textpairs(
                 f"Use one of the following names for text pair: {dataset.column_names}."
             )
 
-        def merge_texts(dataset_entry: Dict[str, str]) -> Dict[str, str]:
+        def merge_texts(dataset_entry: dict[str, str]) -> dict[str, str]:
             dataset_entry[text_column] = (
                 dataset_entry[text_column] + " [SEP] " + dataset_entry[text_pair_column]
             )
@@ -244,7 +242,7 @@ def pre_tokenize(example):
             example[text_column] = [token for token, _ in encoding]
             return example
 
-        dataset = dataset.map(pre_tokenize, num_proc=None, desc="Pre-tokenizing texts with Whitespace")
+        dataset = dataset.map(pre_tokenize, num_proc=None, desc="Whitespace pre-tokenization")
         return dataset
 
     @staticmethod
@@ -262,9 +260,10 @@ def is_valid_entry(sample) -> bool:
             # Check if text is non-empty and does not contain emoji variation character '\uFE0F'
             has_text = bool(text) and (not isinstance(text, list) or "\uFE0F" not in text)
 
-            # Check if label is non-null and all elements are non-negative
+            # Check for empty and negative labels
             valid_label = label is not None and (
-                all(l >= 0 for l in label) if isinstance(label, list) else label >= 0
+                all(word_label >= 0 for word_label in label) if isinstance(label, list)
+                else label >= 0
             )
 
             return has_text and valid_label
@@ -274,7 +273,7 @@
     @staticmethod
     def _make_labels_categorical(
         dataset: Dataset, label_column: str
-    ) -> Tuple[Dataset, Dict[str, int]]:
+    ) -> tuple[Dataset, dict[str, int]]:
         """Convert string labels to integers"""
         unique_labels = sorted(set(dataset[label_column]))
         label_map = {label: idx for idx, label in enumerate(unique_labels)}
@@ -287,7 +286,7 @@ def map_labels(dataset_entry):
         return dataset, label_map
 
     @staticmethod
-    def _create_label_map(dataset: Dataset, label_column: str) -> Dict[str, int]:
+    def _create_label_map(dataset: Dataset, label_column: str) -> dict[str, int]:
         """Try to find feature names in a hf dataset."""
         label_names = getattr(
             getattr(dataset.features[label_column], "feature", None), "names", None
@@ -307,8 +306,8 @@ def _create_label_map(dataset: Dataset, label_column: str) -> Dict[str, int]:
 
     @staticmethod
     def _change_bio_encoding(
-        dataset: Dataset, label_column: str, label_map: Dict[str, int]
-    ) -> Tuple[Dataset, Dict[str, int]]:
+        dataset: Dataset, label_column: str, label_map: dict[str, int]
+    ) -> tuple[Dataset, dict[str, int]]:
         """Remove BIO prefixes from NER labels, update the dataset, and create a new label map."""
 
         # Get unique labels without BIO prefixes and create new label map
@@ -330,15 +329,16 @@
         if label_map == new_label_map:
             logger.warning(
                 "Could not remove BIO prefixes for this tagging dataset. "
-                "Please add the label map as parameter label_map: Dict[str, int] = ... manually."
+                "Please add the label map as parameter label_map: dict[str, int] = ... manually."
             )
 
         return dataset, new_label_map
 
     def log_dataset_info(self) -> None:
         """Log information about dataset"""
-        logger.info("Texts and labels: '%s', '%s'", self.text_column, self.label_column)
-        logger.info("Task category: '%s'", self.task_type)
+        logger.info(f"Texts and labels: {self.text_column}, {self.label_column}")
+        logger.info(f"Label map: {self.label_map}")
         is_downsampled = self.dataset_downsample and self.dataset_downsample < 1.0
         downsample_info = f"(down-sampled to {self.dataset_downsample})" if is_downsampled else ""
-        logger.info("Dataset size: %s texts %s", self.dataset_size, downsample_info)
+        logger.info(f"Dataset size: {self.dataset_size} texts {downsample_info}")
+        logger.info(f"Task category: {self.task_type}")
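Since this PR reworks the `logger.info` calls above, here is a minimal sketch of how a user script could surface the new messages. It uses only the standard-library `logging` module; the logger name is an assumption based on the package's module path, not something specified in this diff:

```python
import logging

# Show INFO-level messages such as "Label map: ..." and "Dataset size: ...".
logging.basicConfig(level=logging.INFO)
logging.getLogger("transformer_ranker").setLevel(logging.INFO)
```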