From 02539ccf2aa0a89d9f997696a5257cd75ce6d5a3 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 11 Mar 2024 12:15:01 +0100 Subject: [PATCH] Ruff update. --- bindings/python/examples/example.py | 8 +- .../python/examples/train_with_datasets.py | 2 +- bindings/python/py_src/tokenizers/__init__.py | 46 +++--- .../python/py_src/tokenizers/__init__.pyi | 131 +++++++++++++++++- .../py_src/tokenizers/decoders/__init__.pyi | 20 +++ .../py_src/tokenizers/models/__init__.pyi | 34 +++++ .../tokenizers/normalizers/__init__.pyi | 47 ++++++- .../tokenizers/pre_tokenizers/__init__.pyi | 34 +++++ .../py_src/tokenizers/processors/__init__.pyi | 16 +++ .../py_src/tokenizers/tools/visualizer.py | 8 +- .../py_src/tokenizers/trainers/__init__.pyi | 4 + bindings/python/pyproject.toml | 12 ++ bindings/python/stub.py | 8 +- .../python/tests/bindings/test_encoding.py | 1 - bindings/python/tests/bindings/test_models.py | 1 - .../python/tests/bindings/test_normalizers.py | 3 +- .../python/tests/bindings/test_processors.py | 9 +- .../python/tests/bindings/test_tokenizer.py | 7 +- .../python/tests/bindings/test_trainers.py | 1 - .../tests/documentation/test_pipeline.py | 1 - .../tests/documentation/test_quicktour.py | 4 - .../test_tutorial_train_from_iterators.py | 2 +- .../implementations/test_base_tokenizer.py | 1 - .../implementations/test_bert_wordpiece.py | 3 +- .../implementations/test_byte_level_bpe.py | 3 +- .../tests/implementations/test_char_bpe.py | 3 +- .../implementations/test_sentencepiece.py | 1 - bindings/python/tests/test_serialization.py | 7 +- 28 files changed, 341 insertions(+), 76 deletions(-) diff --git a/bindings/python/examples/example.py b/bindings/python/examples/example.py index 97b903401..c09d4c128 100644 --- a/bindings/python/examples/example.py +++ b/bindings/python/examples/example.py @@ -4,16 +4,16 @@ from tqdm import tqdm - -logging.getLogger("transformers").disabled = True -logging.getLogger("transformers.tokenization_utils").disabled = True - from tokenizers import Tokenizer, decoders, pre_tokenizers from tokenizers.models import BPE, WordPiece from tokenizers.normalizers import BertNormalizer from tokenizers.processors import BertProcessing from transformers import BertTokenizer, GPT2Tokenizer +logging.getLogger("transformers").disabled = True +logging.getLogger("transformers.tokenization_utils").disabled = True + + parser = argparse.ArgumentParser() parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)") diff --git a/bindings/python/examples/train_with_datasets.py b/bindings/python/examples/train_with_datasets.py index 7c3168342..b54376a2a 100644 --- a/bindings/python/examples/train_with_datasets.py +++ b/bindings/python/examples/train_with_datasets.py @@ -1,6 +1,6 @@ import datasets -from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers +from tokenizers import Tokenizer, models, normalizers, pre_tokenizers # Build a tokenizer diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py index efd574298..313593034 100644 --- a/bindings/python/py_src/tokenizers/__init__.py +++ b/bindings/python/py_src/tokenizers/__init__.py @@ -1,5 +1,28 @@ from enum import Enum from typing import List, Tuple, Union +from .tokenizers import ( + AddedToken, + Encoding, + NormalizedString, + PreTokenizedString, + Regex, + Token, + Tokenizer, + decoders, + models, + normalizers, + pre_tokenizers, + processors, + trainers, + __version__, +) +from .implementations import ( + BertWordPieceTokenizer, + ByteLevelBPETokenizer, + CharBPETokenizer, + SentencePieceBPETokenizer, + SentencePieceUnigramTokenizer, +) Offsets = Tuple[int, int] @@ -75,26 +98,3 @@ class SplitDelimiterBehavior(Enum): CONTIGUOUS = "contiguous" -from .tokenizers import ( - AddedToken, - Encoding, - NormalizedString, - PreTokenizedString, - Regex, - Token, - Tokenizer, - decoders, - models, - normalizers, - pre_tokenizers, - processors, - trainers, - __version__, -) -from .implementations import ( - BertWordPieceTokenizer, - ByteLevelBPETokenizer, - CharBPETokenizer, - SentencePieceBPETokenizer, - SentencePieceUnigramTokenizer, -) diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi index 4b80a7f75..9a8c500ce 100644 --- a/bindings/python/py_src/tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/__init__.pyi @@ -35,38 +35,52 @@ class AddedToken: """ - def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False): + def __init__( + self, + content, + single_word=False, + lstrip=False, + rstrip=False, + normalized=True, + special=False, + ): pass + @property def content(self): """ Get the content of this :obj:`AddedToken` """ pass + @property def lstrip(self): """ Get the value of the :obj:`lstrip` option """ pass + @property def normalized(self): """ Get the value of the :obj:`normalized` option """ pass + @property def rstrip(self): """ Get the value of the :obj:`rstrip` option """ pass + @property def single_word(self): """ Get the value of the :obj:`single_word` option """ pass + @property def special(self): """ @@ -74,6 +88,7 @@ class AddedToken: """ pass + class Encoding: """ The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`. @@ -92,6 +107,7 @@ class Encoding: :obj:`List[int]`: The attention mask """ pass + def char_to_token(self, char_pos, sequence_index=0): """ Get the token that contains the char at the given position in the input sequence. @@ -106,6 +122,7 @@ class Encoding: :obj:`int`: The index of the token that contains this char in the encoded sequence """ pass + def char_to_word(self, char_pos, sequence_index=0): """ Get the word that contains the char at the given position in the input sequence. @@ -120,6 +137,7 @@ class Encoding: :obj:`int`: The index of the word that contains this char in the input sequence """ pass + @property def ids(self): """ @@ -132,6 +150,7 @@ class Encoding: :obj:`List[int]`: The list of IDs """ pass + @staticmethod def merge(encodings, growing_offsets=True): """ @@ -148,6 +167,7 @@ class Encoding: :class:`~tokenizers.Encoding`: The resulting Encoding """ pass + @property def n_sequences(self): """ @@ -157,6 +177,7 @@ class Encoding: :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding` """ pass + @property def offsets(self): """ @@ -169,6 +190,7 @@ class Encoding: A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets """ pass + @property def overflowing(self): """ @@ -183,7 +205,10 @@ class Encoding: maximum length. """ pass - def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"): + + def pad( + self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]" + ): """ Pad the :class:`~tokenizers.Encoding` at the given length @@ -204,6 +229,7 @@ class Encoding: The pad token to use """ pass + @property def sequence_ids(self): """ @@ -217,6 +243,7 @@ class Encoding: A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index. """ pass + def set_sequence_id(self, sequence_id): """ Set the given sequence index @@ -225,6 +252,7 @@ class Encoding: :class:`~tokenizers.Encoding`. """ pass + @property def special_tokens_mask(self): """ @@ -236,6 +264,7 @@ class Encoding: :obj:`List[int]`: The special tokens mask """ pass + def token_to_chars(self, token_index): """ Get the offsets of the token at the given index. @@ -252,6 +281,7 @@ class Encoding: :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` """ pass + def token_to_sequence(self, token_index): """ Get the index of the sequence represented by the given token. @@ -267,6 +297,7 @@ class Encoding: :obj:`int`: The sequence id of the given token """ pass + def token_to_word(self, token_index): """ Get the index of the word that contains the token in one of the input sequences. @@ -283,6 +314,7 @@ class Encoding: :obj:`int`: The index of the word in the relevant input sequence. """ pass + @property def tokens(self): """ @@ -294,6 +326,7 @@ class Encoding: :obj:`List[str]`: The list of tokens """ pass + def truncate(self, max_length, stride=0, direction="right"): """ Truncate the :class:`~tokenizers.Encoding` at the given length @@ -312,6 +345,7 @@ class Encoding: Truncate direction """ pass + @property def type_ids(self): """ @@ -324,6 +358,7 @@ class Encoding: :obj:`List[int]`: The list of type ids """ pass + @property def word_ids(self): """ @@ -341,6 +376,7 @@ class Encoding: A :obj:`List` of :obj:`Optional[int]`: A list of optional word index. """ pass + def word_to_chars(self, word_index, sequence_index=0): """ Get the offsets of the word at the given index in one of the input sequences. @@ -355,6 +391,7 @@ class Encoding: :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` """ pass + def word_to_tokens(self, word_index, sequence_index=0): """ Get the encoded tokens corresponding to the word at the given index @@ -370,6 +407,7 @@ class Encoding: :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` """ pass + @property def words(self): """ @@ -392,6 +430,7 @@ class Encoding: """ pass + class NormalizedString: """ NormalizedString @@ -410,31 +449,37 @@ class NormalizedString: Append the given sequence to the string """ pass + def clear(self): """ Clears the string """ pass + def filter(self, func): """ Filter each character of the string using the given func """ pass + def for_each(self, func): """ Calls the given function for each character of the string """ pass + def lowercase(self): """ Lowercase the string """ pass + def lstrip(self): """ Strip the left of the string """ pass + def map(self, func): """ Calls the given function for each character of the string @@ -443,37 +488,44 @@ class NormalizedString: returned value **must** be a str of length 1 (ie a character). """ pass + def nfc(self): """ Runs the NFC normalization """ pass + def nfd(self): """ Runs the NFD normalization """ pass + def nfkc(self): """ Runs the NFKC normalization """ pass + def nfkd(self): """ Runs the NFKD normalization """ pass + @property def normalized(self): """ The normalized part of the string """ pass + def prepend(self, s): """ Prepend the given sequence to the string """ pass + def replace(self, pattern, content): """ Replace the content of the given pattern with the provided content @@ -486,16 +538,19 @@ class NormalizedString: The content to be used as replacement """ pass + def rstrip(self): """ Strip the right of the string """ pass + def slice(self, range): """ Slice the string using the given range """ pass + def split(self, pattern, behavior): """ Split the NormalizedString using the given pattern and the specified behavior @@ -513,17 +568,20 @@ class NormalizedString: A list of NormalizedString, representing each split """ pass + def strip(self): """ Strip both ends of the string """ pass + def uppercase(self): """ Uppercase the string """ pass + class PreTokenizedString: """ PreTokenizedString @@ -545,6 +603,7 @@ class PreTokenizedString: def __init__(self, sequence): pass + def get_splits(self, offset_referential="original", offset_type="char"): """ Get the splits currently managed by the PreTokenizedString @@ -565,6 +624,7 @@ class PreTokenizedString: A list of splits """ pass + def normalize(self, func): """ Normalize each split of the `PreTokenizedString` using the given `func` @@ -576,6 +636,7 @@ class PreTokenizedString: NormalizedString allow its modification. """ pass + def split(self, func): """ Split the PreTokenizedString using the given `func` @@ -590,6 +651,7 @@ class PreTokenizedString: should come from calling either `.split` or `.slice` on the received one. """ pass + def to_encoding(self, type_id=0, word_idx=None): """ Return an Encoding generated from this PreTokenizedString @@ -607,6 +669,7 @@ class PreTokenizedString: An Encoding """ pass + def tokenize(self, func): """ Tokenize each split of the `PreTokenizedString` using the given `func` @@ -618,6 +681,7 @@ class PreTokenizedString: """ pass + class Regex: """ Instantiate a new Regex with the given pattern @@ -626,9 +690,11 @@ class Regex: def __init__(self, pattern): pass + class Token: pass + class Tokenizer: """ A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input @@ -642,6 +708,7 @@ class Tokenizer: def __init__(self, model): pass + def add_special_tokens(self, tokens): """ Add the given special tokens to the Tokenizer. @@ -662,6 +729,7 @@ class Tokenizer: :obj:`int`: The number of tokens that were created in the vocabulary """ pass + def add_tokens(self, tokens): """ Add the given tokens to the vocabulary @@ -678,6 +746,7 @@ class Tokenizer: :obj:`int`: The number of tokens that were created in the vocabulary """ pass + def decode(self, ids, skip_special_tokens=True): """ Decode the given list of ids back to a string @@ -695,6 +764,7 @@ class Tokenizer: :obj:`str`: The decoded string """ pass + def decode_batch(self, sequences, skip_special_tokens=True): """ Decode a batch of ids back to their corresponding string @@ -710,14 +780,22 @@ class Tokenizer: :obj:`List[str]`: A list of decoded strings """ pass + @property def decoder(self): """ The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer """ pass + def enable_padding( - self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None + self, + direction="right", + pad_id=0, + pad_type_id=0, + pad_token="[PAD]", + length=None, + pad_to_multiple_of=None, ): """ Enable the padding @@ -745,7 +823,10 @@ class Tokenizer: the longest sequence in a batch. """ pass - def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"): + + def enable_truncation( + self, max_length, stride=0, strategy="longest_first", direction="right" + ): """ Enable truncation @@ -765,7 +846,10 @@ class Tokenizer: Truncate direction """ pass - def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True): + + def encode( + self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True + ): """ Encode the given sequence and pair. This method can process raw text sequences as well as already pre-tokenized sequences. @@ -803,6 +887,7 @@ class Tokenizer: """ pass + def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True): """ Encode the given batch of inputs. This method accept both raw text sequences @@ -838,6 +923,20 @@ class Tokenizer: """ pass + + @property + def encode_special_tokens(self): + """ + Modifies the tokenizer in order to use or not the special tokens + during encoding. + + Args: + value (:obj:`bool`): + Whether to use the special tokens or not + + """ + pass + @staticmethod def from_buffer(buffer): """ @@ -851,6 +950,7 @@ class Tokenizer: :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass + @staticmethod def from_file(path): """ @@ -865,6 +965,7 @@ class Tokenizer: :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass + @staticmethod def from_pretrained(identifier, revision="main", auth_token=None): """ @@ -885,6 +986,7 @@ class Tokenizer: :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass + @staticmethod def from_str(json): """ @@ -899,6 +1001,7 @@ class Tokenizer: :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass + def get_added_tokens_decoder(self): """ Get the underlying vocabulary @@ -907,6 +1010,7 @@ class Tokenizer: :obj:`Dict[int, AddedToken]`: The vocabulary """ pass + def get_vocab(self, with_added_tokens=True): """ Get the underlying vocabulary @@ -919,6 +1023,7 @@ class Tokenizer: :obj:`Dict[str, int]`: The vocabulary """ pass + def get_vocab_size(self, with_added_tokens=True): """ Get the size of the underlying vocabulary @@ -931,6 +1036,7 @@ class Tokenizer: :obj:`int`: The size of the vocabulary """ pass + def id_to_token(self, id): """ Convert the given id to its corresponding token if it exists @@ -943,28 +1049,33 @@ class Tokenizer: :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary """ pass + @property def model(self): """ The :class:`~tokenizers.models.Model` in use by the Tokenizer """ pass + def no_padding(self): """ Disable padding """ pass + def no_truncation(self): """ Disable truncation """ pass + @property def normalizer(self): """ The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer """ pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -972,6 +1083,7 @@ class Tokenizer: :return: """ pass + @property def padding(self): """ @@ -984,6 +1096,7 @@ class Tokenizer: A dict with the current padding parameters if padding is enabled """ pass + def post_process(self, encoding, pair=None, add_special_tokens=True): """ Apply all the post-processing steps to the given encodings. @@ -1010,18 +1123,21 @@ class Tokenizer: :class:`~tokenizers.Encoding`: The final post-processed encoding """ pass + @property def post_processor(self): """ The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer """ pass + @property def pre_tokenizer(self): """ The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer """ pass + def save(self, path, pretty=True): """ Save the :class:`~tokenizers.Tokenizer` to the file at the given path. @@ -1034,6 +1150,7 @@ class Tokenizer: Whether the JSON file should be pretty formatted. """ pass + def to_str(self, pretty=False): """ Gets a serialized string representing this :class:`~tokenizers.Tokenizer`. @@ -1046,6 +1163,7 @@ class Tokenizer: :obj:`str`: A string representing the serialized Tokenizer """ pass + def token_to_id(self, token): """ Convert the given token to its corresponding id if it exists @@ -1058,6 +1176,7 @@ class Tokenizer: :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary """ pass + def train(self, files, trainer=None): """ Train the Tokenizer using the given files. @@ -1074,6 +1193,7 @@ class Tokenizer: An optional trainer that should be used to train our Model """ pass + def train_from_iterator(self, iterator, trainer=None, length=None): """ Train the Tokenizer using the provided iterator. @@ -1097,6 +1217,7 @@ class Tokenizer: provide meaningful progress tracking """ pass + @property def truncation(self): """ diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi index 83a0e827d..2a8692f82 100644 --- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi +++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi @@ -20,6 +20,7 @@ class Decoder: """ pass + class BPEDecoder(Decoder): """ BPEDecoder Decoder @@ -32,6 +33,7 @@ class BPEDecoder(Decoder): def __init__(self, suffix=""): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -45,6 +47,7 @@ class BPEDecoder(Decoder): """ pass + class ByteFallback(Decoder): """ ByteFallback Decoder @@ -56,6 +59,7 @@ class ByteFallback(Decoder): def __init__(self): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -69,6 +73,7 @@ class ByteFallback(Decoder): """ pass + class ByteLevel(Decoder): """ ByteLevel Decoder @@ -79,6 +84,7 @@ class ByteLevel(Decoder): def __init__(self): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -92,6 +98,7 @@ class ByteLevel(Decoder): """ pass + class CTC(Decoder): """ CTC Decoder @@ -108,6 +115,7 @@ class CTC(Decoder): def __init__(self, pad_token="", word_delimiter_token="|", cleanup=True): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -121,6 +129,7 @@ class CTC(Decoder): """ pass + class Fuse(Decoder): """ Fuse Decoder @@ -131,6 +140,7 @@ class Fuse(Decoder): def __init__(self): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -144,6 +154,7 @@ class Fuse(Decoder): """ pass + class Metaspace(Decoder): """ Metaspace Decoder @@ -160,6 +171,7 @@ class Metaspace(Decoder): def __init__(self, replacement="▁", add_prefix_space=True): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -173,6 +185,7 @@ class Metaspace(Decoder): """ pass + class Replace(Decoder): """ Replace Decoder @@ -183,6 +196,7 @@ class Replace(Decoder): def __init__(self, pattern, content): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -196,6 +210,7 @@ class Replace(Decoder): """ pass + class Sequence(Decoder): """ Sequence Decoder @@ -207,6 +222,7 @@ class Sequence(Decoder): def __init__(self, decoders): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -220,6 +236,7 @@ class Sequence(Decoder): """ pass + class Strip(Decoder): """ Strip normalizer @@ -228,6 +245,7 @@ class Strip(Decoder): def __init__(self, content, left=0, right=0): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -241,6 +259,7 @@ class Strip(Decoder): """ pass + class WordPiece(Decoder): """ WordPiece Decoder @@ -256,6 +275,7 @@ class WordPiece(Decoder): def __init__(self, prefix="##", cleanup=True): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string diff --git a/bindings/python/py_src/tokenizers/models/__init__.pyi b/bindings/python/py_src/tokenizers/models/__init__.pyi index 0218f8e56..9c747e611 100644 --- a/bindings/python/py_src/tokenizers/models/__init__.pyi +++ b/bindings/python/py_src/tokenizers/models/__init__.pyi @@ -20,6 +20,7 @@ class Model: :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -32,6 +33,7 @@ class Model: :obj:`str`: The token associated to the ID """ pass + def save(self, folder, prefix): """ Save the current model @@ -51,6 +53,7 @@ class Model: :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -63,6 +66,7 @@ class Model: :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence @@ -76,6 +80,7 @@ class Model: """ pass + class BPE(Model): """ An implementation of the BPE (Byte-Pair Encoding) algorithm @@ -124,6 +129,7 @@ class BPE(Model): byte_fallback=False, ): pass + @staticmethod def from_file(cls, vocab, merge, **kwargs): """ @@ -149,6 +155,7 @@ class BPE(Model): :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files """ pass + def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -160,6 +167,7 @@ class BPE(Model): :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -172,6 +180,7 @@ class BPE(Model): :obj:`str`: The token associated to the ID """ pass + @staticmethod def read_file(self, vocab, merges): """ @@ -193,6 +202,7 @@ class BPE(Model): The vocabulary and merges loaded into memory """ pass + def save(self, folder, prefix): """ Save the current model @@ -212,6 +222,7 @@ class BPE(Model): :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -224,6 +235,7 @@ class BPE(Model): :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence @@ -237,6 +249,7 @@ class BPE(Model): """ pass + class Unigram(Model): """ An implementation of the Unigram algorithm @@ -248,6 +261,7 @@ class Unigram(Model): def __init__(self, vocab, unk_id, byte_fallback): pass + def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -259,6 +273,7 @@ class Unigram(Model): :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -271,6 +286,7 @@ class Unigram(Model): :obj:`str`: The token associated to the ID """ pass + def save(self, folder, prefix): """ Save the current model @@ -290,6 +306,7 @@ class Unigram(Model): :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -302,6 +319,7 @@ class Unigram(Model): :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence @@ -315,6 +333,7 @@ class Unigram(Model): """ pass + class WordLevel(Model): """ An implementation of the WordLevel algorithm @@ -331,6 +350,7 @@ class WordLevel(Model): def __init__(self, vocab, unk_token): pass + @staticmethod def from_file(vocab, unk_token): """ @@ -353,6 +373,7 @@ class WordLevel(Model): :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file """ pass + def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -364,6 +385,7 @@ class WordLevel(Model): :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -376,6 +398,7 @@ class WordLevel(Model): :obj:`str`: The token associated to the ID """ pass + @staticmethod def read_file(vocab): """ @@ -393,6 +416,7 @@ class WordLevel(Model): :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` """ pass + def save(self, folder, prefix): """ Save the current model @@ -412,6 +436,7 @@ class WordLevel(Model): :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -424,6 +449,7 @@ class WordLevel(Model): :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence @@ -437,6 +463,7 @@ class WordLevel(Model): """ pass + class WordPiece(Model): """ An implementation of the WordPiece algorithm @@ -454,6 +481,7 @@ class WordPiece(Model): def __init__(self, vocab, unk_token, max_input_chars_per_word): pass + @staticmethod def from_file(vocab, **kwargs): """ @@ -476,6 +504,7 @@ class WordPiece(Model): :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file """ pass + def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -487,6 +516,7 @@ class WordPiece(Model): :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -499,6 +529,7 @@ class WordPiece(Model): :obj:`str`: The token associated to the ID """ pass + @staticmethod def read_file(vocab): """ @@ -517,6 +548,7 @@ class WordPiece(Model): :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` """ pass + def save(self, folder, prefix): """ Save the current model @@ -536,6 +568,7 @@ class WordPiece(Model): :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -548,6 +581,7 @@ class WordPiece(Model): :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi index 09c2d8397..996c255ca 100644 --- a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi @@ -22,6 +22,7 @@ class Normalizer: :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -40,6 +41,7 @@ class Normalizer: """ pass + class BertNormalizer(Normalizer): """ BertNormalizer @@ -63,8 +65,15 @@ class BertNormalizer(Normalizer): Whether to lowercase. """ - def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True): + def __init__( + self, + clean_text=True, + handle_chinese_chars=True, + strip_accents=None, + lowercase=True, + ): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -80,6 +89,7 @@ class BertNormalizer(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -98,6 +108,7 @@ class BertNormalizer(Normalizer): """ pass + class Lowercase(Normalizer): """ Lowercase Normalizer @@ -105,6 +116,7 @@ class Lowercase(Normalizer): def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -120,6 +132,7 @@ class Lowercase(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -138,6 +151,7 @@ class Lowercase(Normalizer): """ pass + class NFC(Normalizer): """ NFC Unicode Normalizer @@ -145,6 +159,7 @@ class NFC(Normalizer): def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -160,6 +175,7 @@ class NFC(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -178,6 +194,7 @@ class NFC(Normalizer): """ pass + class NFD(Normalizer): """ NFD Unicode Normalizer @@ -185,6 +202,7 @@ class NFD(Normalizer): def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -200,6 +218,7 @@ class NFD(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -218,6 +237,7 @@ class NFD(Normalizer): """ pass + class NFKC(Normalizer): """ NFKC Unicode Normalizer @@ -225,6 +245,7 @@ class NFKC(Normalizer): def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -240,6 +261,7 @@ class NFKC(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -258,6 +280,7 @@ class NFKC(Normalizer): """ pass + class NFKD(Normalizer): """ NFKD Unicode Normalizer @@ -265,6 +288,7 @@ class NFKD(Normalizer): def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -280,6 +304,7 @@ class NFKD(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -298,6 +323,7 @@ class NFKD(Normalizer): """ pass + class Nmt(Normalizer): """ Nmt normalizer @@ -305,6 +331,7 @@ class Nmt(Normalizer): def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -320,6 +347,7 @@ class Nmt(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -338,6 +366,7 @@ class Nmt(Normalizer): """ pass + class Precompiled(Normalizer): """ Precompiled normalizer @@ -346,6 +375,7 @@ class Precompiled(Normalizer): def __init__(self, precompiled_charsmap): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -361,6 +391,7 @@ class Precompiled(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -379,6 +410,7 @@ class Precompiled(Normalizer): """ pass + class Prepend(Normalizer): """ Prepend normalizer @@ -386,6 +418,7 @@ class Prepend(Normalizer): def __init__(self, prepend): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -401,6 +434,7 @@ class Prepend(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -419,6 +453,7 @@ class Prepend(Normalizer): """ pass + class Replace(Normalizer): """ Replace normalizer @@ -426,6 +461,7 @@ class Replace(Normalizer): def __init__(self, pattern, content): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -441,6 +477,7 @@ class Replace(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -459,6 +496,7 @@ class Replace(Normalizer): """ pass + class Sequence(Normalizer): """ Allows concatenating multiple other Normalizer as a Sequence. @@ -484,6 +522,7 @@ class Sequence(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -502,6 +541,7 @@ class Sequence(Normalizer): """ pass + class Strip(Normalizer): """ Strip normalizer @@ -509,6 +549,7 @@ class Strip(Normalizer): def __init__(self, left=True, right=True): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -524,6 +565,7 @@ class Strip(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -542,6 +584,7 @@ class Strip(Normalizer): """ pass + class StripAccents(Normalizer): """ StripAccents normalizer @@ -549,6 +592,7 @@ class StripAccents(Normalizer): def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -564,6 +608,7 @@ class StripAccents(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index e3cb84dd2..852a52e19 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -23,6 +23,7 @@ class PreTokenizer: :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -43,6 +44,7 @@ class PreTokenizer: """ pass + class BertPreTokenizer(PreTokenizer): """ BertPreTokenizer @@ -53,6 +55,7 @@ class BertPreTokenizer(PreTokenizer): def __init__(self): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -69,6 +72,7 @@ class BertPreTokenizer(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -89,6 +93,7 @@ class BertPreTokenizer(PreTokenizer): """ pass + class ByteLevel(PreTokenizer): """ ByteLevel PreTokenizer @@ -107,6 +112,7 @@ class ByteLevel(PreTokenizer): def __init__(self, add_prefix_space=True, use_regex=True): pass + @staticmethod def alphabet(): """ @@ -120,6 +126,7 @@ class ByteLevel(PreTokenizer): :obj:`List[str]`: A list of characters that compose the alphabet """ pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -136,6 +143,7 @@ class ByteLevel(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -156,6 +164,7 @@ class ByteLevel(PreTokenizer): """ pass + class CharDelimiterSplit(PreTokenizer): """ This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)` @@ -181,6 +190,7 @@ class CharDelimiterSplit(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -201,6 +211,7 @@ class CharDelimiterSplit(PreTokenizer): """ pass + class Digits(PreTokenizer): """ This pre-tokenizer simply splits using the digits in separate tokens @@ -218,6 +229,7 @@ class Digits(PreTokenizer): def __init__(self, individual_digits=False): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -234,6 +246,7 @@ class Digits(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -254,6 +267,7 @@ class Digits(PreTokenizer): """ pass + class Metaspace(PreTokenizer): """ Metaspace pre-tokenizer @@ -273,6 +287,7 @@ class Metaspace(PreTokenizer): def __init__(self, replacement="_", add_prefix_space=True): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -289,6 +304,7 @@ class Metaspace(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -309,6 +325,7 @@ class Metaspace(PreTokenizer): """ pass + class Punctuation(PreTokenizer): """ This pre-tokenizer simply splits on punctuation as individual characters. @@ -322,6 +339,7 @@ class Punctuation(PreTokenizer): def __init__(self, behavior="isolated"): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -338,6 +356,7 @@ class Punctuation(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -358,6 +377,7 @@ class Punctuation(PreTokenizer): """ pass + class Sequence(PreTokenizer): """ This pre-tokenizer composes other pre_tokenizers and applies them in sequence @@ -365,6 +385,7 @@ class Sequence(PreTokenizer): def __init__(self, pretokenizers): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -381,6 +402,7 @@ class Sequence(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -401,6 +423,7 @@ class Sequence(PreTokenizer): """ pass + class Split(PreTokenizer): """ Split PreTokenizer @@ -424,6 +447,7 @@ class Split(PreTokenizer): def __init__(self, pattern, behavior, invert=False): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -440,6 +464,7 @@ class Split(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -460,6 +485,7 @@ class Split(PreTokenizer): """ pass + class UnicodeScripts(PreTokenizer): """ This pre-tokenizer splits on characters that belong to different language family @@ -470,6 +496,7 @@ class UnicodeScripts(PreTokenizer): def __init__(self): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -486,6 +513,7 @@ class UnicodeScripts(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -506,6 +534,7 @@ class UnicodeScripts(PreTokenizer): """ pass + class Whitespace(PreTokenizer): """ This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` @@ -513,6 +542,7 @@ class Whitespace(PreTokenizer): def __init__(self): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -529,6 +559,7 @@ class Whitespace(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -549,6 +580,7 @@ class Whitespace(PreTokenizer): """ pass + class WhitespaceSplit(PreTokenizer): """ This pre-tokenizer simply splits on the whitespace. Works like `.split()` @@ -556,6 +588,7 @@ class WhitespaceSplit(PreTokenizer): def __init__(self): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -572,6 +605,7 @@ class WhitespaceSplit(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi index ab73a337c..0b1652ca6 100644 --- a/bindings/python/py_src/tokenizers/processors/__init__.pyi +++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi @@ -19,6 +19,7 @@ class PostProcessor: :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -38,6 +39,7 @@ class PostProcessor: """ pass + class BertProcessing(PostProcessor): """ This post-processor takes care of adding the special tokens needed by @@ -56,6 +58,7 @@ class BertProcessing(PostProcessor): def __init__(self, sep, cls): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -68,6 +71,7 @@ class BertProcessing(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -87,6 +91,7 @@ class BertProcessing(PostProcessor): """ pass + class ByteLevel(PostProcessor): """ This post-processor takes care of trimming the offsets. @@ -101,6 +106,7 @@ class ByteLevel(PostProcessor): def __init__(self, trim_offsets=True): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -113,6 +119,7 @@ class ByteLevel(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -132,6 +139,7 @@ class ByteLevel(PostProcessor): """ pass + class RobertaProcessing(PostProcessor): """ This post-processor takes care of adding the special tokens needed by @@ -162,6 +170,7 @@ class RobertaProcessing(PostProcessor): def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -174,6 +183,7 @@ class RobertaProcessing(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -193,6 +203,7 @@ class RobertaProcessing(PostProcessor): """ pass + class Sequence(PostProcessor): """ Sequence Processor @@ -204,6 +215,7 @@ class Sequence(PostProcessor): def __init__(self, processors): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -216,6 +228,7 @@ class Sequence(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -235,6 +248,7 @@ class Sequence(PostProcessor): """ pass + class TemplateProcessing(PostProcessor): """ Provides a way to specify templates in order to add the special tokens to each @@ -305,6 +319,7 @@ class TemplateProcessing(PostProcessor): def __init__(self, single, pair, special_tokens): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -317,6 +332,7 @@ class TemplateProcessing(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one diff --git a/bindings/python/py_src/tokenizers/tools/visualizer.py b/bindings/python/py_src/tokenizers/tools/visualizer.py index da368054c..df8b4b496 100644 --- a/bindings/python/py_src/tokenizers/tools/visualizer.py +++ b/bindings/python/py_src/tokenizers/tools/visualizer.py @@ -92,7 +92,7 @@ def __init__( if default_to_notebook: try: from IPython.core.display import HTML, display - except ImportError as e: + except ImportError: raise Exception( """We couldn't import IPython utils for html display. Are you running in a notebook? @@ -136,7 +136,7 @@ def __call__( if final_default_to_notebook: try: from IPython.core.display import HTML, display - except ImportError as e: + except ImportError: raise Exception( """We couldn't import IPython utils for html display. Are you running in a notebook?""" @@ -170,12 +170,12 @@ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]: if h_step < 20: h_step = 20 s = 32 - l = 64 + light = 64 h = 10 colors = {} for label in sorted(labels): # sort so we always get the same colors for a given set of labels - colors[label] = f"hsl({h},{s}%,{l}%" + colors[label] = f"hsl({h},{s}%,{light}%" h += h_step return colors diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi index 911fdeb29..399340cc7 100644 --- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi @@ -7,6 +7,7 @@ class Trainer: Trainer will return an instance of this class when instantiated. """ + class BpeTrainer(Trainer): """ Trainer capable of training a BPE model @@ -46,6 +47,7 @@ class BpeTrainer(Trainer): """ + class UnigramTrainer(Trainer): """ Trainer capable of training a Unigram model @@ -93,6 +95,7 @@ class UnigramTrainer(Trainer): ): pass + class WordLevelTrainer(Trainer): """ Trainer capable of training a WorldLevel model @@ -111,6 +114,7 @@ class WordLevelTrainer(Trainer): A list of special tokens the model should know of. """ + class WordPieceTrainer(Trainer): """ Trainer capable of training a WordPiece model diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 1b3c8df42..8b043de0c 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -52,3 +52,15 @@ features = ["pyo3/extension-module"] [tool.black] line-length = 119 target-version = ['py35'] + +[tool.ruff] +lint.ignore = [ + # a == None in tests vs is None. + "E711", + # a == False in tests vs is False. + "E712", + # try.. import except.. pattern without using the lib. + "F401", + # Raw type equality is required in asserts + "E721", +] diff --git a/bindings/python/stub.py b/bindings/python/stub.py index 6495d1fda..672eb803f 100644 --- a/bindings/python/stub.py +++ b/bindings/python/stub.py @@ -3,8 +3,6 @@ import os from pathlib import Path -import black - INDENT = " " * 4 GENERATED_COMMENT = "# Generated content DO NOT EDIT\n" @@ -124,7 +122,7 @@ def py_file(module, origin): import subprocess from typing import List, Optional, Tuple -def do_ruff(code): +def do_ruff(code, is_pyi: bool): command = ["ruff", "format", "-", "--config", "pyproject.toml", "--silent"] process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) stdout, _ = process.communicate(input=code.encode()) @@ -150,7 +148,7 @@ def write(module, directory, origin, check=False): filename = os.path.join(directory, "__init__.pyi") pyi_content = pyi_file(module) - pyi_content = do_black(pyi_content, is_pyi=True) + pyi_content = do_ruff(pyi_content, is_pyi=True) os.makedirs(directory, exist_ok=True) if check: with open(filename, "r") as f: @@ -162,7 +160,7 @@ def write(module, directory, origin, check=False): filename = os.path.join(directory, "__init__.py") py_content = py_file(module, origin) - py_content = do_black(py_content, is_pyi=False) + py_content = do_ruff(py_content, is_pyi=False) os.makedirs(directory, exist_ok=True) is_auto = False diff --git a/bindings/python/tests/bindings/test_encoding.py b/bindings/python/tests/bindings/test_encoding.py index 80b8cc2bb..189da4fee 100644 --- a/bindings/python/tests/bindings/test_encoding.py +++ b/bindings/python/tests/bindings/test_encoding.py @@ -2,7 +2,6 @@ from tokenizers import BertWordPieceTokenizer -from ..utils import bert_files, data_dir class TestEncoding: diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py index b4b29682d..cae7f2e6e 100644 --- a/bindings/python/tests/bindings/test_models.py +++ b/bindings/python/tests/bindings/test_models.py @@ -4,7 +4,6 @@ from tokenizers.models import BPE, Model, WordLevel, WordPiece -from ..utils import bert_files, data_dir, roberta_files class TestBPE: diff --git a/bindings/python/tests/bindings/test_normalizers.py b/bindings/python/tests/bindings/test_normalizers.py index cf9f3d1a4..3fafd60d1 100644 --- a/bindings/python/tests/bindings/test_normalizers.py +++ b/bindings/python/tests/bindings/test_normalizers.py @@ -2,8 +2,7 @@ import pytest -from tokenizers import NormalizedString, Tokenizer -from tokenizers.models import BPE +from tokenizers import NormalizedString from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py index 14af9fbe4..4522977f8 100644 --- a/bindings/python/tests/bindings/test_processors.py +++ b/bindings/python/tests/bindings/test_processors.py @@ -15,7 +15,6 @@ TemplateProcessing, ) -from ..utils import data_dir, roberta_files class TestBertProcessing: @@ -146,18 +145,18 @@ def test_instantiate(self): assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing) # It is absolutely legal to have tokens with spaces in the name: - processor = TemplateProcessing( + TemplateProcessing( single=["[ C L S ]", "Token with space"], special_tokens=[("[ C L S ]", 0), ("Token with space", 1)], ) # Sequence identifiers must be well formed: with pytest.raises(Exception, match="Cannot build Piece"): - processor = TemplateProcessing(single="[CLS] $$ [SEP]") + TemplateProcessing(single="[CLS] $$ [SEP]") with pytest.raises(Exception, match="Cannot build Piece"): - processor = TemplateProcessing(single="[CLS] $A: [SEP]") + TemplateProcessing(single="[CLS] $A: [SEP]") # Special tokens must be provided when used in template: with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"): - processor = TemplateProcessing(single=["[CLS]"]) + TemplateProcessing(single=["[CLS]"]) def test_bert_parity(self): tokenizer = Tokenizer(BPE()) diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py index a1e41c283..362198f41 100644 --- a/bindings/python/tests/bindings/test_tokenizer.py +++ b/bindings/python/tests/bindings/test_tokenizer.py @@ -5,12 +5,11 @@ from tokenizers import AddedToken, Encoding, Tokenizer from tokenizers.implementations import BertWordPieceTokenizer -from tokenizers.models import BPE, Model, WordPiece, Unigram -from tokenizers.normalizers import Lowercase +from tokenizers.models import BPE, Model, Unigram from tokenizers.pre_tokenizers import ByteLevel -from tokenizers.processors import BertProcessing, RobertaProcessing +from tokenizers.processors import RobertaProcessing -from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files +from ..utils import multiprocessing_with_parallelism class TestAddedToken: diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py index 87021533c..3a799ab45 100644 --- a/bindings/python/tests/bindings/test_trainers.py +++ b/bindings/python/tests/bindings/test_trainers.py @@ -14,7 +14,6 @@ trainers, ) -from ..utils import data_dir, train_files class TestBpeTrainer: diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py index 90117f075..0a8c02530 100644 --- a/bindings/python/tests/documentation/test_pipeline.py +++ b/bindings/python/tests/documentation/test_pipeline.py @@ -1,6 +1,5 @@ from tokenizers import Tokenizer -from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer disable_printing = True diff --git a/bindings/python/tests/documentation/test_quicktour.py b/bindings/python/tests/documentation/test_quicktour.py index 866a6f99d..e81c642b4 100644 --- a/bindings/python/tests/documentation/test_quicktour.py +++ b/bindings/python/tests/documentation/test_quicktour.py @@ -1,9 +1,5 @@ from tokenizers import Tokenizer -from tokenizers.models import BPE -from tokenizers.pre_tokenizers import Whitespace -from tokenizers.trainers import BpeTrainer -from ..utils import data_dir, doc_wiki_tokenizer disable_printing = True diff --git a/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py b/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py index bba55a48b..d6b4f321e 100644 --- a/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py +++ b/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py @@ -1,10 +1,10 @@ +# flake8: noqa import gzip import os import datasets import pytest -from ..utils import data_dir, train_files class TestTrainFromIterators: diff --git a/bindings/python/tests/implementations/test_base_tokenizer.py b/bindings/python/tests/implementations/test_base_tokenizer.py index 5b4c45160..d42c40c4e 100644 --- a/bindings/python/tests/implementations/test_base_tokenizer.py +++ b/bindings/python/tests/implementations/test_base_tokenizer.py @@ -1,4 +1,3 @@ -import pytest from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors from tokenizers.implementations import BaseTokenizer diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py index a05d98c94..4333bc87d 100644 --- a/bindings/python/tests/implementations/test_bert_wordpiece.py +++ b/bindings/python/tests/implementations/test_bert_wordpiece.py @@ -1,8 +1,7 @@ -import pytest from tokenizers import BertWordPieceTokenizer -from ..utils import bert_files, data_dir, multiprocessing_with_parallelism +from ..utils import multiprocessing_with_parallelism class TestBertWordPieceTokenizer: diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py index 579575d3f..3f063a24c 100644 --- a/bindings/python/tests/implementations/test_byte_level_bpe.py +++ b/bindings/python/tests/implementations/test_byte_level_bpe.py @@ -1,8 +1,7 @@ -import pytest from tokenizers import ByteLevelBPETokenizer -from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files +from ..utils import multiprocessing_with_parallelism class TestByteLevelBPE: diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py index 09b2fc6e1..de7db1c8c 100644 --- a/bindings/python/tests/implementations/test_char_bpe.py +++ b/bindings/python/tests/implementations/test_char_bpe.py @@ -1,8 +1,7 @@ -import pytest from tokenizers import CharBPETokenizer -from ..utils import data_dir, multiprocessing_with_parallelism, openai_files +from ..utils import multiprocessing_with_parallelism class TestCharBPETokenizer: diff --git a/bindings/python/tests/implementations/test_sentencepiece.py b/bindings/python/tests/implementations/test_sentencepiece.py index d9fade774..a7f7208a2 100644 --- a/bindings/python/tests/implementations/test_sentencepiece.py +++ b/bindings/python/tests/implementations/test_sentencepiece.py @@ -1,4 +1,3 @@ -import os import pytest diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py index 2057d763e..22ed21e90 100644 --- a/bindings/python/tests/test_serialization.py +++ b/bindings/python/tests/test_serialization.py @@ -7,7 +7,6 @@ from huggingface_hub import HfApi, cached_download, hf_hub_url from tokenizers import Tokenizer -from .utils import albert_base, data_dir class TestSerialization: @@ -15,7 +14,7 @@ def test_full_serialization_albert(self, albert_base): # Check we can read this file. # This used to fail because of BufReader that would fail because the # file exceeds the buffer capacity - tokenizer = Tokenizer.from_file(albert_base) + Tokenizer.from_file(albert_base) def check(tokenizer_file) -> bool: @@ -51,8 +50,6 @@ def test_full_deserialization_hub(self): # Check we can read this file. # This used to fail because of BufReader that would fail because the # file exceeds the buffer capacity - api = HfApi() - not_loadable = [] invalid_pre_tokenizer = [] @@ -77,7 +74,7 @@ def test_full_deserialization_hub(self): except Exception as e: print(f"{model_id} is not loadable: {e}") not_loadable.append(model_id) - except: + except: # noqa: E722 print(f"{model_id} is not loadable: Rust error") not_loadable.append(model_id)