Commit

Fix.
Narsil committed Mar 11, 2024
1 parent 732d0ca commit 180ee65
Showing 31 changed files with 40 additions and 271 deletions.
4 changes: 2 additions & 2 deletions bindings/python/Makefile
@@ -9,13 +9,13 @@ check_dirs := examples py_src/tokenizers tests
style:
python stub.py
ruff check $(check_dirs) --fix
ruff format --line-length 119 --target-version py35 $(check_dirs)
ruff format $(check_dirs)

# Check the source code is formatted correctly
check-style:
python stub.py --check
ruff check examples py_src/tokenizers tests
ruff format --check --line-length 119 --target-version py35 examples py_src/tokenizers tests
ruff format --check examples py_src/tokenizers tests

TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json

5 changes: 1 addition & 4 deletions bindings/python/examples/example.py
@@ -14,7 +14,6 @@
logging.getLogger("transformers.tokenization_utils").disabled = True



parser = argparse.ArgumentParser()
parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")
parser.add_argument("--file", default=None, type=str, help="The file to encode")
@@ -51,9 +50,7 @@
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split(
"\n"
)
""".split("\n")

if args.type == "gpt2":
print("Running GPT-2 tokenizer")
42 changes: 5 additions & 37 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -34,16 +34,7 @@ class AddedToken:
Defines whether this token should be skipped when decoding.
"""

def __init__(
self,
content,
single_word=False,
lstrip=False,
rstrip=False,
normalized=True,
special=False,
):
def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
pass

@property
@@ -88,12 +79,10 @@ class AddedToken:
"""
pass


class Encoding:
"""
The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
"""

@property
def attention_mask(self):
"""
@@ -206,9 +195,7 @@ class Encoding:
"""
pass

def pad(
self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"
):
def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
"""
Pad the :class:`~tokenizers.Encoding` at the given length
@@ -430,7 +417,6 @@ class Encoding:
"""
pass


class NormalizedString:
"""
NormalizedString
@@ -443,7 +429,6 @@ class NormalizedString:
sequence: str:
The string sequence used to initialize this NormalizedString
"""

def append(self, s):
"""
Append the given sequence to the string
@@ -581,7 +566,6 @@ class NormalizedString:
"""
pass


class PreTokenizedString:
"""
PreTokenizedString
@@ -600,7 +584,6 @@ class PreTokenizedString:
sequence: str:
The string sequence used to initialize this PreTokenizedString
"""

def __init__(self, sequence):
pass

@@ -681,20 +664,16 @@ class PreTokenizedString:
"""
pass


class Regex:
"""
Instantiate a new Regex with the given pattern
"""

def __init__(self, pattern):
pass


class Token:
pass


class Tokenizer:
"""
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
@@ -705,7 +684,6 @@ class Tokenizer:
The core algorithm that this :obj:`Tokenizer` should be using.
"""

def __init__(self, model):
pass

@@ -789,13 +767,7 @@ class Tokenizer:
pass

def enable_padding(
self,
direction="right",
pad_id=0,
pad_type_id=0,
pad_token="[PAD]",
length=None,
pad_to_multiple_of=None,
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
):
"""
Enable the padding
@@ -824,9 +796,7 @@ class Tokenizer:
"""
pass

def enable_truncation(
self, max_length, stride=0, strategy="longest_first", direction="right"
):
def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
"""
Enable truncation
@@ -847,9 +817,7 @@ class Tokenizer:
"""
pass

def encode(
self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True
):
def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences.
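
For reference, the methods whose signatures are collapsed onto one line above (enable_padding, enable_truncation, encode) are the public padding, truncation, and encoding entry points of the tokenizers Python API. A minimal usage sketch follows; the tokenizer file name, the added token, and the sample sentence are illustrative placeholders and are not part of this commit:

    from tokenizers import AddedToken, Tokenizer

    # "roberta.json" stands in for any tokenizer file produced by Tokenizer.save().
    tokenizer = Tokenizer.from_file("roberta.json")

    # Register an extra token, using the AddedToken options shown in the stub above.
    tokenizer.add_tokens([AddedToken("<custom>", lstrip=False, rstrip=False, normalized=True)])

    # Pad and truncate every encoding to a fixed length of 16 tokens.
    tokenizer.enable_padding(direction="right", pad_id=0, pad_token="[PAD]", length=16)
    tokenizer.enable_truncation(max_length=16, stride=0, strategy="longest_first")

    encoding = tokenizer.encode("Hello, world!", add_special_tokens=True)
    print(encoding.tokens)
    print(encoding.attention_mask)
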
21 changes: 0 additions & 21 deletions bindings/python/py_src/tokenizers/decoders/__init__.pyi
@@ -6,7 +6,6 @@ class Decoder:
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
"""

def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -20,7 +19,6 @@ class Decoder:
"""
pass


class BPEDecoder(Decoder):
"""
BPEDecoder Decoder
@@ -30,7 +28,6 @@ class BPEDecoder(Decoder):
The suffix that was used to caracterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding
"""

def __init__(self, suffix="</w>"):
pass

@@ -47,7 +44,6 @@ class BPEDecoder(Decoder):
"""
pass


class ByteFallback(Decoder):
"""
ByteFallback Decoder
@@ -56,7 +52,6 @@ class ByteFallback(Decoder):
cannot be decoded you will get � instead for each inconvertable byte token
"""

def __init__(self):
pass

@@ -73,15 +68,13 @@ class ByteFallback(Decoder):
"""
pass


class ByteLevel(Decoder):
"""
ByteLevel Decoder
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
"""

def __init__(self):
pass

@@ -98,7 +91,6 @@ class ByteLevel(Decoder):
"""
pass


class CTC(Decoder):
"""
CTC Decoder
@@ -112,7 +104,6 @@ class CTC(Decoder):
Whether to cleanup some tokenization artifacts.
Mainly spaces before punctuation, and some abbreviated english forms.
"""

def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
pass

@@ -129,15 +120,13 @@ class CTC(Decoder):
"""
pass


class Fuse(Decoder):
"""
Fuse Decoder
Fuse simply fuses every token into a single string.
This is the last step of decoding, this decoder exists only if
there is need to add other decoders *after* the fusion
"""

def __init__(self):
pass

@@ -154,7 +143,6 @@ class Fuse(Decoder):
"""
pass


class Metaspace(Decoder):
"""
Metaspace Decoder
@@ -168,7 +156,6 @@ class Metaspace(Decoder):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""

def __init__(self, replacement="▁", add_prefix_space=True):
pass

@@ -185,15 +172,13 @@ class Metaspace(Decoder):
"""
pass


class Replace(Decoder):
"""
Replace Decoder
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
"""

def __init__(self, pattern, content):
pass

@@ -210,7 +195,6 @@ class Replace(Decoder):
"""
pass


class Sequence(Decoder):
"""
Sequence Decoder
@@ -219,7 +203,6 @@ class Sequence(Decoder):
decoders (:obj:`List[Decoder]`)
The decoders that need to be chained
"""

def __init__(self, decoders):
pass

@@ -236,13 +219,11 @@ class Sequence(Decoder):
"""
pass


class Strip(Decoder):
"""
Strip normalizer
Strips n left characters of each token, or n right characters of each token
"""

def __init__(self, content, left=0, right=0):
pass

@@ -259,7 +240,6 @@ class Strip(Decoder):
"""
pass


class WordPiece(Decoder):
"""
WordPiece Decoder
@@ -272,7 +252,6 @@ class WordPiece(Decoder):
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated english forms.
"""

def __init__(self, prefix="##", cleanup=True):
pass

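The decoder classes in this stub turn model tokens back into readable text. A minimal usage sketch; the tokenizer file name and the input sentence are illustrative placeholders, not part of this commit:

    from tokenizers import Tokenizer, decoders

    # "wordpiece.json" stands in for any WordPiece-based tokenizer file.
    tokenizer = Tokenizer.from_file("wordpiece.json")

    # Strip the "##" continuation prefix and clean up spaces before punctuation,
    # matching the WordPiece decoder defaults shown in the stub above.
    tokenizer.decoder = decoders.WordPiece(prefix="##", cleanup=True)

    encoding = tokenizer.encode("Tokenizers are fast.")
    print(tokenizer.decode(encoding.ids))
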
8 changes: 0 additions & 8 deletions bindings/python/py_src/tokenizers/models/__init__.py
@@ -1,8 +0,0 @@
# Generated content DO NOT EDIT
from .. import models

Model = models.Model
BPE = models.BPE
Unigram = models.Unigram
WordLevel = models.WordLevel
WordPiece = models.WordPiece
(The diffs for the remaining changed files are not shown here.)
