Commit

Fix.
Narsil committed Mar 11, 2024
1 parent 732d0ca commit 180ee65
Showing 31 changed files with 40 additions and 271 deletions.
4 changes: 2 additions & 2 deletions bindings/python/Makefile
@@ -9,13 +9,13 @@ check_dirs := examples py_src/tokenizers tests
style:
python stub.py
ruff check $(check_dirs) --fix
ruff format --line-length 119 --target-version py35 $(check_dirs)
ruff format $(check_dirs)

# Check the source code is formatted correctly
check-style:
python stub.py --check
ruff check examples py_src/tokenizers tests
ruff format --check --line-length 119 --target-version py35 examples py_src/tokenizers tests
ruff format --check examples py_src/tokenizers tests

TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json

5 changes: 1 addition & 4 deletions bindings/python/examples/example.py
@@ -14,7 +14,6 @@
logging.getLogger("transformers.tokenization_utils").disabled = True



parser = argparse.ArgumentParser()
parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")
parser.add_argument("--file", default=None, type=str, help="The file to encode")
@@ -51,9 +50,7 @@
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split(
"\n"
)
""".split("\n")

if args.type == "gpt2":
print("Running GPT-2 tokenizer")
42 changes: 5 additions & 37 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -34,16 +34,7 @@ class AddedToken:
Defines whether this token should be skipped when decoding.
"""

def __init__(
self,
content,
single_word=False,
lstrip=False,
rstrip=False,
normalized=True,
special=False,
):
def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
pass

@property
@@ -88,12 +79,10 @@ class AddedToken:
"""
pass


class Encoding:
"""
The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
"""

@property
def attention_mask(self):
"""
@@ -206,9 +195,7 @@ class Encoding:
"""
pass

def pad(
self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"
):
def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
"""
Pad the :class:`~tokenizers.Encoding` at the given length
@@ -430,7 +417,6 @@ class Encoding:
"""
pass


class NormalizedString:
"""
NormalizedString
@@ -443,7 +429,6 @@ class NormalizedString:
sequence: str:
The string sequence used to initialize this NormalizedString
"""

def append(self, s):
"""
Append the given sequence to the string
@@ -581,7 +566,6 @@ class NormalizedString:
"""
pass


class PreTokenizedString:
"""
PreTokenizedString
@@ -600,7 +584,6 @@ class PreTokenizedString:
sequence: str:
The string sequence used to initialize this PreTokenizedString
"""

def __init__(self, sequence):
pass

@@ -681,20 +664,16 @@ class PreTokenizedString:
"""
pass


class Regex:
"""
Instantiate a new Regex with the given pattern
"""

def __init__(self, pattern):
pass


class Token:
pass


class Tokenizer:
"""
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
@@ -705,7 +684,6 @@ class Tokenizer:
The core algorithm that this :obj:`Tokenizer` should be using.
"""

def __init__(self, model):
pass

@@ -789,13 +767,7 @@ class Tokenizer:
pass

def enable_padding(
self,
direction="right",
pad_id=0,
pad_type_id=0,
pad_token="[PAD]",
length=None,
pad_to_multiple_of=None,
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
):
"""
Enable the padding
@@ -824,9 +796,7 @@ class Tokenizer:
"""
pass

def enable_truncation(
self, max_length, stride=0, strategy="longest_first", direction="right"
):
def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
"""
Enable truncation
@@ -847,9 +817,7 @@ class Tokenizer:
"""
pass

def encode(
self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True
):
def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences.
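
For reference, the methods whose signatures are collapsed onto one line above (enable_padding, enable_truncation, encode) are the public padding, truncation, and encoding entry points of the tokenizers Python API. A minimal usage sketch follows; the tokenizer file name, the added token, and the sample sentence are illustrative placeholders and are not part of this commit:

    from tokenizers import AddedToken, Tokenizer

    # "roberta.json" stands in for any tokenizer file produced by Tokenizer.save().
    tokenizer = Tokenizer.from_file("roberta.json")

    # Register an extra token, using the AddedToken options shown in the stub above.
    tokenizer.add_tokens([AddedToken("<custom>", lstrip=False, rstrip=False, normalized=True)])

    # Pad and truncate every encoding to a fixed length of 16 tokens.
    tokenizer.enable_padding(direction="right", pad_id=0, pad_token="[PAD]", length=16)
    tokenizer.enable_truncation(max_length=16, stride=0, strategy="longest_first")

    encoding = tokenizer.encode("Hello, world!", add_special_tokens=True)
    print(encoding.tokens)
    print(encoding.attention_mask)
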
21 changes: 0 additions & 21 deletions bindings/python/py_src/tokenizers/decoders/__init__.pyi
@@ -6,7 +6,6 @@ class Decoder:
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
"""

def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -20,7 +19,6 @@ class Decoder:
"""
pass


class BPEDecoder(Decoder):
"""
BPEDecoder Decoder
@@ -30,7 +28,6 @@ class BPEDecoder(Decoder):
The suffix that was used to caracterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding
"""

def __init__(self, suffix="</w>"):
pass

@@ -47,7 +44,6 @@ class BPEDecoder(Decoder):
"""
pass


class ByteFallback(Decoder):
"""
ByteFallback Decoder
@@ -56,7 +52,6 @@ class ByteFallback(Decoder):
cannot be decoded you will get � instead for each inconvertable byte token
"""

def __init__(self):
pass

@@ -73,15 +68,13 @@ class ByteFallback(Decoder):
"""
pass


class ByteLevel(Decoder):
"""
ByteLevel Decoder
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
"""

def __init__(self):
pass

@@ -98,7 +91,6 @@ class ByteLevel(Decoder):
"""
pass


class CTC(Decoder):
"""
CTC Decoder
@@ -112,7 +104,6 @@ class CTC(Decoder):
Whether to cleanup some tokenization artifacts.
Mainly spaces before punctuation, and some abbreviated english forms.
"""

def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
pass

@@ -129,15 +120,13 @@ class CTC(Decoder):
"""
pass


class Fuse(Decoder):
"""
Fuse Decoder
Fuse simply fuses every token into a single string.
This is the last step of decoding, this decoder exists only if
there is need to add other decoders *after* the fusion
"""

def __init__(self):
pass

@@ -154,7 +143,6 @@ class Fuse(Decoder):
"""
pass


class Metaspace(Decoder):
"""
Metaspace Decoder
@@ -168,7 +156,6 @@ class Metaspace(Decoder):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""

def __init__(self, replacement="▁", add_prefix_space=True):
pass

@@ -185,15 +172,13 @@ class Metaspace(Decoder):
"""
pass


class Replace(Decoder):
"""
Replace Decoder
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
"""

def __init__(self, pattern, content):
pass

@@ -210,7 +195,6 @@ class Replace(Decoder):
"""
pass


class Sequence(Decoder):
"""
Sequence Decoder
@@ -219,7 +203,6 @@ class Sequence(Decoder):
decoders (:obj:`List[Decoder]`)
The decoders that need to be chained
"""

def __init__(self, decoders):
pass

@@ -236,13 +219,11 @@ class Sequence(Decoder):
"""
pass


class Strip(Decoder):
"""
Strip normalizer
Strips n left characters of each token, or n right characters of each token
"""

def __init__(self, content, left=0, right=0):
pass

@@ -259,7 +240,6 @@ class Strip(Decoder):
"""
pass


class WordPiece(Decoder):
"""
WordPiece Decoder
@@ -272,7 +252,6 @@ class WordPiece(Decoder):
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated english forms.
"""

def __init__(self, prefix="##", cleanup=True):
pass

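The decoder classes in this stub turn model tokens back into readable text. A minimal usage sketch; the tokenizer file name and the input sentence are illustrative placeholders, not part of this commit:

    from tokenizers import Tokenizer, decoders

    # "wordpiece.json" stands in for any WordPiece-based tokenizer file.
    tokenizer = Tokenizer.from_file("wordpiece.json")

    # Strip the "##" continuation prefix and clean up spaces before punctuation,
    # matching the WordPiece decoder defaults shown in the stub above.
    tokenizer.decoder = decoders.WordPiece(prefix="##", cleanup=True)

    encoding = tokenizer.encode("Tokenizers are fast.")
    print(tokenizer.decode(encoding.ids))
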
8 changes: 0 additions & 8 deletions bindings/python/py_src/tokenizers/models/__init__.py
@@ -1,8 +0,0 @@
# Generated content DO NOT EDIT
from .. import models

Model = models.Model
BPE = models.BPE
Unigram = models.Unigram
WordLevel = models.WordLevel
WordPiece = models.WordPiece
(The diffs for the remaining changed files are not shown here.)
