diff --git a/bindings/python/Makefile b/bindings/python/Makefile index eb5d2c57c..f07cff584 100644 --- a/bindings/python/Makefile +++ b/bindings/python/Makefile @@ -9,13 +9,13 @@ check_dirs := examples py_src/tokenizers tests style: python stub.py ruff check $(check_dirs) --fix - ruff format --line-length 119 --target-version py35 $(check_dirs) + ruff format $(check_dirs) # Check the source code is formatted correctly check-style: python stub.py --check ruff check examples py_src/tokenizers tests - ruff format --check --line-length 119 --target-version py35 examples py_src/tokenizers tests + ruff format --check examples py_src/tokenizers tests TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json diff --git a/bindings/python/examples/example.py b/bindings/python/examples/example.py index c09d4c128..d62390644 100644 --- a/bindings/python/examples/example.py +++ b/bindings/python/examples/example.py @@ -14,7 +14,6 @@ logging.getLogger("transformers.tokenization_utils").disabled = True - parser = argparse.ArgumentParser() parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)") parser.add_argument("--file", default=None, type=str, help="The file to encode") @@ -51,9 +50,7 @@ If the implementation is hard to explain, it's a bad idea. If the implementation is easy to explain, it may be a good idea. Namespaces are one honking great idea -- let's do more of those! -""".split( - "\n" - ) +""".split("\n") if args.type == "gpt2": print("Running GPT-2 tokenizer") diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi index 9a8c500ce..5dbc665dc 100644 --- a/bindings/python/py_src/tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/__init__.pyi @@ -34,16 +34,7 @@ class AddedToken: Defines whether this token should be skipped when decoding. """ - - def __init__( - self, - content, - single_word=False, - lstrip=False, - rstrip=False, - normalized=True, - special=False, - ): + def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False): pass @property @@ -88,12 +79,10 @@ class AddedToken: """ pass - class Encoding: """ The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`. """ - @property def attention_mask(self): """ @@ -206,9 +195,7 @@ class Encoding: """ pass - def pad( - self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]" - ): + def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"): """ Pad the :class:`~tokenizers.Encoding` at the given length @@ -430,7 +417,6 @@ class Encoding: """ pass - class NormalizedString: """ NormalizedString @@ -443,7 +429,6 @@ class NormalizedString: sequence: str: The string sequence used to initialize this NormalizedString """ - def append(self, s): """ Append the given sequence to the string @@ -581,7 +566,6 @@ class NormalizedString: """ pass - class PreTokenizedString: """ PreTokenizedString @@ -600,7 +584,6 @@ class PreTokenizedString: sequence: str: The string sequence used to initialize this PreTokenizedString """ - def __init__(self, sequence): pass @@ -681,20 +664,16 @@ class PreTokenizedString: """ pass - class Regex: """ Instantiate a new Regex with the given pattern """ - def __init__(self, pattern): pass - class Token: pass - class Tokenizer: """ A :obj:`Tokenizer` works as a pipeline. 
It processes some raw text as input @@ -705,7 +684,6 @@ class Tokenizer: The core algorithm that this :obj:`Tokenizer` should be using. """ - def __init__(self, model): pass @@ -789,13 +767,7 @@ class Tokenizer: pass def enable_padding( - self, - direction="right", - pad_id=0, - pad_type_id=0, - pad_token="[PAD]", - length=None, - pad_to_multiple_of=None, + self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None ): """ Enable the padding @@ -824,9 +796,7 @@ class Tokenizer: """ pass - def enable_truncation( - self, max_length, stride=0, strategy="longest_first", direction="right" - ): + def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"): """ Enable truncation @@ -847,9 +817,7 @@ class Tokenizer: """ pass - def encode( - self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True - ): + def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True): """ Encode the given sequence and pair. This method can process raw text sequences as well as already pre-tokenized sequences. diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi index 2a8692f82..94dda2354 100644 --- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi +++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi @@ -6,7 +6,6 @@ class Decoder: This class is not supposed to be instantiated directly. Instead, any implementation of a Decoder will return an instance of this class when instantiated. """ - def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -20,7 +19,6 @@ class Decoder: """ pass - class BPEDecoder(Decoder): """ BPEDecoder Decoder @@ -30,7 +28,6 @@ class BPEDecoder(Decoder): The suffix that was used to caracterize an end-of-word. This suffix will be replaced by whitespaces during the decoding """ - def __init__(self, suffix=""): pass @@ -47,7 +44,6 @@ class BPEDecoder(Decoder): """ pass - class ByteFallback(Decoder): """ ByteFallback Decoder @@ -56,7 +52,6 @@ class ByteFallback(Decoder): cannot be decoded you will get � instead for each inconvertable byte token """ - def __init__(self): pass @@ -73,7 +68,6 @@ class ByteFallback(Decoder): """ pass - class ByteLevel(Decoder): """ ByteLevel Decoder @@ -81,7 +75,6 @@ class ByteLevel(Decoder): This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel` :class:`~tokenizers.pre_tokenizers.PreTokenizer`. """ - def __init__(self): pass @@ -98,7 +91,6 @@ class ByteLevel(Decoder): """ pass - class CTC(Decoder): """ CTC Decoder @@ -112,7 +104,6 @@ class CTC(Decoder): Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, and some abbreviated english forms. """ - def __init__(self, pad_token="", word_delimiter_token="|", cleanup=True): pass @@ -129,7 +120,6 @@ class CTC(Decoder): """ pass - class Fuse(Decoder): """ Fuse Decoder @@ -137,7 +127,6 @@ class Fuse(Decoder): This is the last step of decoding, this decoder exists only if there is need to add other decoders *after* the fusion """ - def __init__(self): pass @@ -154,7 +143,6 @@ class Fuse(Decoder): """ pass - class Metaspace(Decoder): """ Metaspace Decoder @@ -168,7 +156,6 @@ class Metaspace(Decoder): Whether to add a space to the first word if there isn't already one. This lets us treat `hello` exactly like `say hello`. 
""" - def __init__(self, replacement="▁", add_prefix_space=True): pass @@ -185,7 +172,6 @@ class Metaspace(Decoder): """ pass - class Replace(Decoder): """ Replace Decoder @@ -193,7 +179,6 @@ class Replace(Decoder): This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace` :class:`~tokenizers.pre_tokenizers.PreTokenizer`. """ - def __init__(self, pattern, content): pass @@ -210,7 +195,6 @@ class Replace(Decoder): """ pass - class Sequence(Decoder): """ Sequence Decoder @@ -219,7 +203,6 @@ class Sequence(Decoder): decoders (:obj:`List[Decoder]`) The decoders that need to be chained """ - def __init__(self, decoders): pass @@ -236,13 +219,11 @@ class Sequence(Decoder): """ pass - class Strip(Decoder): """ Strip normalizer Strips n left characters of each token, or n right characters of each token """ - def __init__(self, content, left=0, right=0): pass @@ -259,7 +240,6 @@ class Strip(Decoder): """ pass - class WordPiece(Decoder): """ WordPiece Decoder @@ -272,7 +252,6 @@ class WordPiece(Decoder): Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, and some abbreviated english forms. """ - def __init__(self, prefix="##", cleanup=True): pass diff --git a/bindings/python/py_src/tokenizers/models/__init__.py b/bindings/python/py_src/tokenizers/models/__init__.py index 68ac211aa..e69de29bb 100644 --- a/bindings/python/py_src/tokenizers/models/__init__.py +++ b/bindings/python/py_src/tokenizers/models/__init__.py @@ -1,8 +0,0 @@ -# Generated content DO NOT EDIT -from .. import models - -Model = models.Model -BPE = models.BPE -Unigram = models.Unigram -WordLevel = models.WordLevel -WordPiece = models.WordPiece diff --git a/bindings/python/py_src/tokenizers/models/__init__.pyi b/bindings/python/py_src/tokenizers/models/__init__.pyi index 9c747e611..b46f32f25 100644 --- a/bindings/python/py_src/tokenizers/models/__init__.pyi +++ b/bindings/python/py_src/tokenizers/models/__init__.pyi @@ -8,7 +8,6 @@ class Model: This class cannot be constructed directly. Please use one of the concrete models. """ - def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -80,7 +79,6 @@ class Model: """ pass - class BPE(Model): """ An implementation of the BPE (Byte-Pair Encoding) algorithm @@ -115,7 +113,6 @@ class BPE(Model): byte_fallback (:obj:`bool`, `optional`): Whether to use spm byte-fallback trick (defaults to False) """ - def __init__( self, vocab=None, @@ -249,7 +246,6 @@ class BPE(Model): """ pass - class Unigram(Model): """ An implementation of the Unigram algorithm @@ -258,7 +254,6 @@ class Unigram(Model): vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`): A list of vocabulary items and their relative score [("am", -0.2442),...] """ - def __init__(self, vocab, unk_id, byte_fallback): pass @@ -333,7 +328,6 @@ class Unigram(Model): """ pass - class WordLevel(Model): """ An implementation of the WordLevel algorithm @@ -347,7 +341,6 @@ class WordLevel(Model): unk_token (:obj:`str`, `optional`): The unknown token to be used by the model. """ - def __init__(self, vocab, unk_token): pass @@ -463,7 +456,6 @@ class WordLevel(Model): """ pass - class WordPiece(Model): """ An implementation of the WordPiece algorithm @@ -478,7 +470,6 @@ class WordPiece(Model): max_input_chars_per_word (:obj:`int`, `optional`): The maximum number of characters to authorize in a single word. 
""" - def __init__(self, vocab, unk_token, max_input_chars_per_word): pass diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi index 996c255ca..507d44731 100644 --- a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi @@ -6,7 +6,6 @@ class Normalizer: This class is not supposed to be instantiated directly. Instead, any implementation of a Normalizer will return an instance of this class when instantiated. """ - def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -41,7 +40,6 @@ class Normalizer: """ pass - class BertNormalizer(Normalizer): """ BertNormalizer @@ -64,14 +62,7 @@ class BertNormalizer(Normalizer): lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase. """ - - def __init__( - self, - clean_text=True, - handle_chinese_chars=True, - strip_accents=None, - lowercase=True, - ): + def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True): pass def normalize(self, normalized): @@ -108,12 +99,10 @@ class BertNormalizer(Normalizer): """ pass - class Lowercase(Normalizer): """ Lowercase Normalizer """ - def __init__(self): pass @@ -151,12 +140,10 @@ class Lowercase(Normalizer): """ pass - class NFC(Normalizer): """ NFC Unicode Normalizer """ - def __init__(self): pass @@ -194,12 +181,10 @@ class NFC(Normalizer): """ pass - class NFD(Normalizer): """ NFD Unicode Normalizer """ - def __init__(self): pass @@ -237,12 +222,10 @@ class NFD(Normalizer): """ pass - class NFKC(Normalizer): """ NFKC Unicode Normalizer """ - def __init__(self): pass @@ -280,12 +263,10 @@ class NFKC(Normalizer): """ pass - class NFKD(Normalizer): """ NFKD Unicode Normalizer """ - def __init__(self): pass @@ -323,12 +304,10 @@ class NFKD(Normalizer): """ pass - class Nmt(Normalizer): """ Nmt normalizer """ - def __init__(self): pass @@ -366,13 +345,11 @@ class Nmt(Normalizer): """ pass - class Precompiled(Normalizer): """ Precompiled normalizer Don't use manually it is used for compatiblity for SentencePiece. """ - def __init__(self, precompiled_charsmap): pass @@ -410,12 +387,10 @@ class Precompiled(Normalizer): """ pass - class Prepend(Normalizer): """ Prepend normalizer """ - def __init__(self, prepend): pass @@ -453,12 +428,10 @@ class Prepend(Normalizer): """ pass - class Replace(Normalizer): """ Replace normalizer """ - def __init__(self, pattern, content): pass @@ -496,7 +469,6 @@ class Replace(Normalizer): """ pass - class Sequence(Normalizer): """ Allows concatenating multiple other Normalizer as a Sequence. 
@@ -506,7 +478,6 @@ class Sequence(Normalizer): normalizers (:obj:`List[Normalizer]`): A list of Normalizer to be run as a sequence """ - def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -541,12 +512,10 @@ class Sequence(Normalizer): """ pass - class Strip(Normalizer): """ Strip normalizer """ - def __init__(self, left=True, right=True): pass @@ -584,12 +553,10 @@ class Strip(Normalizer): """ pass - class StripAccents(Normalizer): """ StripAccents normalizer """ - def __init__(self): pass diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py index 48277f0d2..e69de29bb 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py @@ -1,15 +0,0 @@ -# Generated content DO NOT EDIT -from .. import pre_tokenizers - -PreTokenizer = pre_tokenizers.PreTokenizer -BertPreTokenizer = pre_tokenizers.BertPreTokenizer -ByteLevel = pre_tokenizers.ByteLevel -CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit -Digits = pre_tokenizers.Digits -Metaspace = pre_tokenizers.Metaspace -Punctuation = pre_tokenizers.Punctuation -Sequence = pre_tokenizers.Sequence -Split = pre_tokenizers.Split -UnicodeScripts = pre_tokenizers.UnicodeScripts -Whitespace = pre_tokenizers.Whitespace -WhitespaceSplit = pre_tokenizers.WhitespaceSplit diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index 852a52e19..9e975326f 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -6,7 +6,6 @@ class PreTokenizer: This class is not supposed to be instantiated directly. Instead, any implementation of a PreTokenizer will return an instance of this class when instantiated. """ - def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -44,7 +43,6 @@ class PreTokenizer: """ pass - class BertPreTokenizer(PreTokenizer): """ BertPreTokenizer @@ -52,7 +50,6 @@ class BertPreTokenizer(PreTokenizer): This pre-tokenizer splits tokens on spaces, and also on punctuation. Each occurence of a punctuation character will be treated separately. """ - def __init__(self): pass @@ -93,7 +90,6 @@ class BertPreTokenizer(PreTokenizer): """ pass - class ByteLevel(PreTokenizer): """ ByteLevel PreTokenizer @@ -109,7 +105,6 @@ class ByteLevel(PreTokenizer): Set this to :obj:`False` to prevent this `pre_tokenizer` from using the GPT2 specific regexp for spliting on whitespace. """ - def __init__(self, add_prefix_space=True, use_regex=True): pass @@ -164,7 +159,6 @@ class ByteLevel(PreTokenizer): """ pass - class CharDelimiterSplit(PreTokenizer): """ This pre-tokenizer simply splits on the provided char. 
Works like `.split(delimiter)` @@ -173,7 +167,6 @@ class CharDelimiterSplit(PreTokenizer): delimiter: str: The delimiter char that will be used to split input """ - def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -211,7 +204,6 @@ class CharDelimiterSplit(PreTokenizer): """ pass - class Digits(PreTokenizer): """ This pre-tokenizer simply splits using the digits in separate tokens @@ -226,7 +218,6 @@ class Digits(PreTokenizer): "Call 123 please" -> "Call ", "123", " please" """ - def __init__(self, individual_digits=False): pass @@ -267,7 +258,6 @@ class Digits(PreTokenizer): """ pass - class Metaspace(PreTokenizer): """ Metaspace pre-tokenizer @@ -284,7 +274,6 @@ class Metaspace(PreTokenizer): Whether to add a space to the first word if there isn't already one. This lets us treat `hello` exactly like `say hello`. """ - def __init__(self, replacement="_", add_prefix_space=True): pass @@ -325,7 +314,6 @@ class Metaspace(PreTokenizer): """ pass - class Punctuation(PreTokenizer): """ This pre-tokenizer simply splits on punctuation as individual characters. @@ -336,7 +324,6 @@ class Punctuation(PreTokenizer): Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next", "contiguous" """ - def __init__(self, behavior="isolated"): pass @@ -377,12 +364,10 @@ class Punctuation(PreTokenizer): """ pass - class Sequence(PreTokenizer): """ This pre-tokenizer composes other pre_tokenizers and applies them in sequence """ - def __init__(self, pretokenizers): pass @@ -423,7 +408,6 @@ class Sequence(PreTokenizer): """ pass - class Split(PreTokenizer): """ Split PreTokenizer @@ -444,7 +428,6 @@ class Split(PreTokenizer): invert (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to invert the pattern. """ - def __init__(self, pattern, behavior, invert=False): pass @@ -485,7 +468,6 @@ class Split(PreTokenizer): """ pass - class UnicodeScripts(PreTokenizer): """ This pre-tokenizer splits on characters that belong to different language family @@ -493,7 +475,6 @@ class UnicodeScripts(PreTokenizer): Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too. This mimicks SentencePiece Unigram implementation. """ - def __init__(self): pass @@ -534,12 +515,10 @@ class UnicodeScripts(PreTokenizer): """ pass - class Whitespace(PreTokenizer): """ This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` """ - def __init__(self): pass @@ -580,12 +559,10 @@ class Whitespace(PreTokenizer): """ pass - class WhitespaceSplit(PreTokenizer): """ This pre-tokenizer simply splits on the whitespace. Works like `.split()` """ - def __init__(self): pass diff --git a/bindings/python/py_src/tokenizers/processors/__init__.py b/bindings/python/py_src/tokenizers/processors/__init__.py index 06d124037..e69de29bb 100644 --- a/bindings/python/py_src/tokenizers/processors/__init__.py +++ b/bindings/python/py_src/tokenizers/processors/__init__.py @@ -1,9 +0,0 @@ -# Generated content DO NOT EDIT -from .. 
import processors - -PostProcessor = processors.PostProcessor -BertProcessing = processors.BertProcessing -ByteLevel = processors.ByteLevel -RobertaProcessing = processors.RobertaProcessing -Sequence = processors.Sequence -TemplateProcessing = processors.TemplateProcessing diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi index 0b1652ca6..5136d02bb 100644 --- a/bindings/python/py_src/tokenizers/processors/__init__.pyi +++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi @@ -6,7 +6,6 @@ class PostProcessor: This class is not supposed to be instantiated directly. Instead, any implementation of a PostProcessor will return an instance of this class when instantiated. """ - def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -39,7 +38,6 @@ class PostProcessor: """ pass - class BertProcessing(PostProcessor): """ This post-processor takes care of adding the special tokens needed by @@ -55,7 +53,6 @@ class BertProcessing(PostProcessor): cls (:obj:`Tuple[str, int]`): A tuple with the string representation of the CLS token, and its id """ - def __init__(self, sep, cls): pass @@ -91,7 +88,6 @@ class BertProcessing(PostProcessor): """ pass - class ByteLevel(PostProcessor): """ This post-processor takes care of trimming the offsets. @@ -103,7 +99,6 @@ class ByteLevel(PostProcessor): trim_offsets (:obj:`bool`): Whether to trim the whitespaces from the produced offsets. """ - def __init__(self, trim_offsets=True): pass @@ -139,7 +134,6 @@ class ByteLevel(PostProcessor): """ pass - class RobertaProcessing(PostProcessor): """ This post-processor takes care of adding the special tokens needed by @@ -167,7 +161,6 @@ class RobertaProcessing(PostProcessor): Whether the add_prefix_space option was enabled during pre-tokenization. This is relevant because it defines the way the offsets are trimmed out. """ - def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True): pass @@ -203,7 +196,6 @@ class RobertaProcessing(PostProcessor): """ pass - class Sequence(PostProcessor): """ Sequence Processor @@ -212,7 +204,6 @@ class Sequence(PostProcessor): processors (:obj:`List[PostProcessor]`) The processors that need to be chained """ - def __init__(self, processors): pass @@ -248,7 +239,6 @@ class Sequence(PostProcessor): """ pass - class TemplateProcessing(PostProcessor): """ Provides a way to specify templates in order to add the special tokens to each @@ -316,7 +306,6 @@ class TemplateProcessing(PostProcessor): The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have the same length. """ - def __init__(self, single, pair, special_tokens): pass diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.py b/bindings/python/py_src/tokenizers/trainers/__init__.py index 22f94c50b..e69de29bb 100644 --- a/bindings/python/py_src/tokenizers/trainers/__init__.py +++ b/bindings/python/py_src/tokenizers/trainers/__init__.py @@ -1,8 +0,0 @@ -# Generated content DO NOT EDIT -from .. 
import trainers - -Trainer = trainers.Trainer -BpeTrainer = trainers.BpeTrainer -UnigramTrainer = trainers.UnigramTrainer -WordLevelTrainer = trainers.WordLevelTrainer -WordPieceTrainer = trainers.WordPieceTrainer diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi index 399340cc7..d6c525718 100644 --- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi @@ -7,7 +7,6 @@ class Trainer: Trainer will return an instance of this class when instantiated. """ - class BpeTrainer(Trainer): """ Trainer capable of training a BPE model @@ -47,7 +46,6 @@ class BpeTrainer(Trainer): """ - class UnigramTrainer(Trainer): """ Trainer capable of training a Unigram model @@ -82,7 +80,6 @@ class UnigramTrainer(Trainer): The number of iterations of the EM algorithm to perform before pruning the vocabulary. """ - def __init__( self, vocab_size=8000, @@ -95,7 +92,6 @@ class UnigramTrainer(Trainer): ): pass - class WordLevelTrainer(Trainer): """ Trainer capable of training a WorldLevel model @@ -114,7 +110,6 @@ class WordLevelTrainer(Trainer): A list of special tokens the model should know of. """ - class WordPieceTrainer(Trainer): """ Trainer capable of training a WordPiece model @@ -147,7 +142,6 @@ class WordPieceTrainer(Trainer): end_of_word_suffix (:obj:`str`, `optional`): A suffix to be used for every subword that is a end-of-word. """ - def __init__( self, vocab_size=30000, diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index cbe463370..8702fc37b 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -54,6 +54,8 @@ line-length = 119 target-version = ['py35'] [tool.ruff] +line-length = 119 +target-version = "py311" lint.ignore = [ # a == None in tests vs is None. 
"E711", diff --git a/bindings/python/scripts/convert.py b/bindings/python/scripts/convert.py index 6c812f8c7..e6df5ad8a 100644 --- a/bindings/python/scripts/convert.py +++ b/bindings/python/scripts/convert.py @@ -80,9 +80,7 @@ def tokenizer(self, proto): tokenizer = Tokenizer(Unigram(vocab, unk_id)) elif model_type == 2: vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract() - tokenizer = Tokenizer( - BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True) - ) + tokenizer = Tokenizer(BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True)) else: raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" @@ -105,12 +103,8 @@ def converted(self): replacement = "▁" add_prefix_space = True - tokenizer.pre_tokenizer = Metaspace( - replacement=replacement, add_prefix_space=add_prefix_space - ) - tokenizer.decoder = decoders.Metaspace( - replacement=replacement, add_prefix_space=add_prefix_space - ) + tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) + tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) post_processor = self.post_processor(tokenizer) if post_processor: tokenizer.post_processor = post_processor @@ -124,9 +118,7 @@ def converted(self): class AlbertConverter(SpmConverter): def vocab(self, proto): return [ - (piece.piece, piece.score) - if check_number_comma(piece.piece) - else (piece.piece, piece.score - 100) + (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100) for piece in proto.pieces ] @@ -261,9 +253,7 @@ def post_processor(self, tokenizer): class XLNetConverter(SpmConverter): def vocab(self, proto): return [ - (piece.piece, piece.score) - if check_number_comma(piece.piece) - else (piece.piece, piece.score - 100) + (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100) for piece in proto.pieces ] @@ -420,9 +410,7 @@ def main(): print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|") for pretrained in args.models: status, speedup = check(pretrained, args.filename) - print( - f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|" - ) + print(f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|") if __name__ == "__main__": diff --git a/bindings/python/scripts/sentencepiece_extractor.py b/bindings/python/scripts/sentencepiece_extractor.py index fba05d8f4..a7bce9b49 100644 --- a/bindings/python/scripts/sentencepiece_extractor.py +++ b/bindings/python/scripts/sentencepiece_extractor.py @@ -59,7 +59,6 @@ def __init__(self, model: str): def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: with open(self._model, "r") as model_f: - # Retrieve information nb_pieces, nb_merges = map(int, model_f.readline().split()) vocab, merges = {}, [] @@ -97,9 +96,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: choices=["sentencepiece", "youtokentome"], help="Indicate the format of the file.", ) - parser.add_argument( - "--model", type=str, required=True, help="SentencePiece model to extract vocab from." 
- ) + parser.add_argument("--model", type=str, required=True, help="SentencePiece model to extract vocab from.") parser.add_argument( "--vocab-output-path", type=str, @@ -128,9 +125,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: args.model = f.name # Allocate extractor - extractor = ( - SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor - ) + extractor = SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor extractor = extractor(args.model) logger.info(f"Using {type(extractor).__name__}") diff --git a/bindings/python/scripts/spm_parity_check.py b/bindings/python/scripts/spm_parity_check.py index 09e5b9475..33cfff4fa 100644 --- a/bindings/python/scripts/spm_parity_check.py +++ b/bindings/python/scripts/spm_parity_check.py @@ -121,9 +121,7 @@ def check_train(args): break print(f"Tokenizer used {tokenizer_tokens}, where spm used {spm_tokens}") - assert ( - tokenizer_tokens < spm_tokens - ), "Our trainer should be at least more efficient than the SPM one" + assert tokenizer_tokens < spm_tokens, "Our trainer should be at least more efficient than the SPM one" print("Ok our trainer is at least more efficient than the SPM one") @@ -131,9 +129,7 @@ def check_diff(spm_diff, tok_diff, sp, tok): if spm_diff == list(reversed(tok_diff)): # AAA -> AA+A vs A+AA case. return True - elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode( - tok_diff - ): + elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(tok_diff): # Second order OK # Barrich -> Barr + ich vs Bar + rich return True @@ -173,24 +169,17 @@ def check_details(line, spm_ids, tok_ids, sp, tok): spms = Counter(spm_ids[first:last]) toks = Counter(tok_ids[first:last]) - removable_tokens = { - spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si - } + removable_tokens = {spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si} min_width = 3 for i in range(last - first - min_width): - if all( - spm_ids[first + i + j] in removable_tokens for j in range(min_width) - ): + if all(spm_ids[first + i + j] in removable_tokens for j in range(min_width)): possible_matches = [ k for k in range(last - first - min_width) - if tok_ids[first + k : first + k + min_width] - == spm_ids[first + i : first + i + min_width] + if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width] ] for j in possible_matches: - if check_diff( - spm_ids[first : first + i], tok_ids[first : first + j], sp, tok - ) and check_details( + if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], sp, tok) and check_details( line, spm_ids[first + i : last], tok_ids[first + j : last], @@ -210,9 +199,7 @@ def check_details(line, spm_ids, tok_ids, sp, tok): wrong = tok.decode(spm_ids[first:last]) print() if has_color: - print( - f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}" - ) + print(f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}") else: print(wrong) return False @@ -251,9 +238,7 @@ def check_encode(args): if args.verbose: if i % 10000 == 0: - print( - f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})" - ) + print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})") print(f"SPM: {spm_total_time} - TOK: {tok_total_time}") if ids != encoded.ids: @@ -265,13 +250,13 @@ def check_encode(args): else: perfect += 1 - assert ids == encoded.ids, f"line {i}: {line} : 
\n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}" + assert ( + ids == encoded.ids + ), f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}" print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})") total = perfect + imperfect + wrong - print( - f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}" - ) + print(f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}") if __name__ == "__main__": diff --git a/bindings/python/stub.py b/bindings/python/stub.py index 672eb803f..41ef2d6ec 100644 --- a/bindings/python/stub.py +++ b/bindings/python/stub.py @@ -83,7 +83,7 @@ def pyi_file(obj, indent=""): body += f"{indent+INDENT}pass\n" body += "\n" - for (name, fn) in fns: + for name, fn in fns: body += pyi_file(fn, indent=indent) if not body: @@ -119,28 +119,18 @@ def py_file(module, origin): string += f"{name} = {origin}.{name}\n" return string + import subprocess from typing import List, Optional, Tuple + def do_ruff(code, is_pyi: bool): - command = ["ruff", "format", "-", "--config", "pyproject.toml", "--silent"] + command = ["ruff", "format", "--config", "pyproject.toml", "--silent", "-"] + if is_pyi: + command.extend(["--stdin-filename", "test.pyi"]) process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - stdout, _ = process.communicate(input=code.encode()) - return stdout.decode() - - -def do_black(content, is_pyi): - mode = black.Mode( - target_versions={black.TargetVersion.PY35}, - line_length=119, - is_pyi=is_pyi, - string_normalization=True, - experimental_string_processing=False, - ) - try: - return black.format_file_contents(content, fast=True, mode=mode) - except black.NothingChanged: - return content + stdout, _ = process.communicate(input=code.encode("utf-8")) + return stdout.decode("utf-8") def write(module, directory, origin, check=False): diff --git a/bindings/python/tests/bindings/test_encoding.py b/bindings/python/tests/bindings/test_encoding.py index 189da4fee..e82fc6dda 100644 --- a/bindings/python/tests/bindings/test_encoding.py +++ b/bindings/python/tests/bindings/test_encoding.py @@ -3,7 +3,6 @@ from tokenizers import BertWordPieceTokenizer - class TestEncoding: @pytest.fixture(scope="class") def encodings(self, bert_files): diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py index cae7f2e6e..919b5043b 100644 --- a/bindings/python/tests/bindings/test_models.py +++ b/bindings/python/tests/bindings/test_models.py @@ -5,7 +5,6 @@ from tokenizers.models import BPE, Model, WordLevel, WordPiece - class TestBPE: def test_instantiate(self, roberta_files): assert isinstance(BPE(), Model) diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py index 4522977f8..f30c93d2f 100644 --- a/bindings/python/tests/bindings/test_processors.py +++ b/bindings/python/tests/bindings/test_processors.py @@ -16,7 +16,6 @@ ) - class TestBertProcessing: def test_instantiate(self): processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1)) diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py index 3a799ab45..0f406c288 100644 --- a/bindings/python/tests/bindings/test_trainers.py +++ b/bindings/python/tests/bindings/test_trainers.py @@ -15,7 +15,6 @@ ) - class TestBpeTrainer: def test_can_modify(self): trainer = trainers.BpeTrainer( diff --git 
a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py index 0a8c02530..ae8127be6 100644 --- a/bindings/python/tests/documentation/test_pipeline.py +++ b/bindings/python/tests/documentation/test_pipeline.py @@ -1,7 +1,6 @@ from tokenizers import Tokenizer - disable_printing = True original_print = print diff --git a/bindings/python/tests/documentation/test_quicktour.py b/bindings/python/tests/documentation/test_quicktour.py index e81c642b4..8bca8d680 100644 --- a/bindings/python/tests/documentation/test_quicktour.py +++ b/bindings/python/tests/documentation/test_quicktour.py @@ -1,7 +1,6 @@ from tokenizers import Tokenizer - disable_printing = True original_print = print diff --git a/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py b/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py index f0934cf11..2ba51c4be 100644 --- a/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py +++ b/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py @@ -6,7 +6,6 @@ import pytest - class TestTrainFromIterators: @staticmethod def get_tokenizer_trainer(): diff --git a/bindings/python/tests/implementations/test_base_tokenizer.py b/bindings/python/tests/implementations/test_base_tokenizer.py index d42c40c4e..535964656 100644 --- a/bindings/python/tests/implementations/test_base_tokenizer.py +++ b/bindings/python/tests/implementations/test_base_tokenizer.py @@ -1,4 +1,3 @@ - from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors from tokenizers.implementations import BaseTokenizer diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py index 4333bc87d..2c47beb68 100644 --- a/bindings/python/tests/implementations/test_bert_wordpiece.py +++ b/bindings/python/tests/implementations/test_bert_wordpiece.py @@ -1,4 +1,3 @@ - from tokenizers import BertWordPieceTokenizer from ..utils import multiprocessing_with_parallelism diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py index 3f063a24c..d6af6e31e 100644 --- a/bindings/python/tests/implementations/test_byte_level_bpe.py +++ b/bindings/python/tests/implementations/test_byte_level_bpe.py @@ -1,4 +1,3 @@ - from tokenizers import ByteLevelBPETokenizer from ..utils import multiprocessing_with_parallelism diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py index de7db1c8c..b786ca850 100644 --- a/bindings/python/tests/implementations/test_char_bpe.py +++ b/bindings/python/tests/implementations/test_char_bpe.py @@ -1,4 +1,3 @@ - from tokenizers import CharBPETokenizer from ..utils import multiprocessing_with_parallelism diff --git a/bindings/python/tests/implementations/test_sentencepiece.py b/bindings/python/tests/implementations/test_sentencepiece.py index a7f7208a2..1da41fec0 100644 --- a/bindings/python/tests/implementations/test_sentencepiece.py +++ b/bindings/python/tests/implementations/test_sentencepiece.py @@ -1,4 +1,3 @@ - import pytest from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py index 22ed21e90..d0111a88e 100644 --- a/bindings/python/tests/test_serialization.py +++ b/bindings/python/tests/test_serialization.py @@ 
-8,7 +8,6 @@ from tokenizers import Tokenizer - class TestSerialization: def test_full_serialization_albert(self, albert_base): # Check we can read this file. @@ -74,7 +73,7 @@ def test_full_deserialization_hub(self): except Exception as e: print(f"{model_id} is not loadable: {e}") not_loadable.append(model_id) - except: # noqa: E722 + except: # noqa: E722 print(f"{model_id} is not loadable: Rust error") not_loadable.append(model_id)
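
Note: the stub.py hunk above swaps the old black-based do_black() helper for a ruff subprocess call that formats generated stubs via stdin, driven by the new [tool.ruff] settings added to pyproject.toml. The snippet below is a minimal standalone sketch of that invocation, not part of the patch; it assumes ruff is installed and is run from bindings/python where pyproject.toml lives, and the helper name format_with_ruff, the stdin filename stub.pyi, and the sample input are illustrative only.

    import subprocess

    def format_with_ruff(code: str, is_pyi: bool = False) -> str:
        # Mirrors the do_ruff() helper introduced in stub.py: ruff reads the
        # source from stdin ("-") and writes the formatted result to stdout,
        # picking up line-length/target-version from [tool.ruff] in pyproject.toml.
        command = ["ruff", "format", "--config", "pyproject.toml", "--silent", "-"]
        if is_pyi:
            # Ask ruff to apply .pyi stub formatting rules to the stdin stream.
            command.extend(["--stdin-filename", "stub.pyi"])
        result = subprocess.run(command, input=code.encode("utf-8"), capture_output=True, check=True)
        return result.stdout.decode("utf-8")

    # Illustrative usage with a deliberately mis-formatted input:
    print(format_with_ruff("def f( a,b ):\n    return a+b\n"))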