diff --git a/docs/source/en/model_doc/codegen.md b/docs/source/en/model_doc/codegen.md
index 78be813db1a60b..bee8c8a0762044 100644
--- a/docs/source/en/model_doc/codegen.md
+++ b/docs/source/en/model_doc/codegen.md
@@ -72,6 +72,7 @@ hello_world()
 ## CodeGenTokenizer
 
 [[autodoc]] CodeGenTokenizer
+    - create_token_type_ids_from_sequences
     - save_vocabulary
 
 ## CodeGenTokenizerFast
diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py
index abf64e1892250e..1b03af7008465d 100644
--- a/src/transformers/models/codegen/tokenization_codegen.py
+++ b/src/transformers/models/codegen/tokenization_codegen.py
@@ -134,6 +134,8 @@ class CodeGenTokenizer(PreTrainedTokenizer):
             other word. (CodeGen tokenizer detect beginning of words by the preceding space).
         add_bos_token (`bool`, *optional*, defaults to `False`):
             Whether to add a beginning of sequence token at the start of sequences.
+        return_token_type_ids (`bool`, *optional*, defaults to `False`):
+            Whether to return token type IDs.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -150,6 +152,7 @@ def __init__(
         pad_token=None,
         add_prefix_space=False,
         add_bos_token=False,
+        return_token_type_ids=False,
         **kwargs,
     ):
         bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
@@ -157,6 +160,9 @@ def __init__(
         unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
         pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
         self.add_bos_token = add_bos_token
+        self.return_token_type_ids = return_token_type_ids
+        if self.return_token_type_ids:
+            self.model_input_names.append("token_type_ids")
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
@@ -181,6 +187,7 @@ def __init__(
             pad_token=pad_token,
             add_prefix_space=add_prefix_space,
             add_bos_token=add_bos_token,
+            return_token_type_ids=return_token_type_ids,
             **kwargs,
         )
@@ -270,6 +277,35 @@ def convert_tokens_to_string(self, tokens):
         text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
         return text
 
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence
+        pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id] if self.sep_token_id is not None else []
+        cls = [self.cls_token_id] if self.cls_token_id is not None else []
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py
index fb9f0442e03001..b086fb84a65af9 100644
--- a/src/transformers/models/codegen/tokenization_codegen_fast.py
+++ b/src/transformers/models/codegen/tokenization_codegen_fast.py
@@ -91,6 +91,8 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (CodeGen tokenizer detect beginning of words by the preceding space).
+        return_token_type_ids (`bool`, *optional*, defaults to `False`):
+            Whether to return token type IDs.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -106,8 +108,13 @@ def __init__(
         bos_token="<|endoftext|>",
         eos_token="<|endoftext|>",
         add_prefix_space=False,
+        return_token_type_ids=False,
         **kwargs,
     ):
+        self.return_token_type_ids = return_token_type_ids
+        if self.return_token_type_ids:
+            self.model_input_names.append("token_type_ids")
+
         super().__init__(
             vocab_file,
             merges_file,
@@ -116,6 +123,7 @@ def __init__(
             bos_token=bos_token,
             eos_token=eos_token,
             add_prefix_space=add_prefix_space,
+            return_token_type_ids=return_token_type_ids,
             **kwargs,
         )
@@ -157,6 +165,36 @@ def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
 
         return super()._encode_plus(*args, **kwargs)
 
+    # Copied from transformers.models.codegen.tokenization_codegen.CodeGenTokenizer.create_token_type_ids_from_sequences
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence
+        pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id] if self.sep_token_id is not None else []
+        cls = [self.cls_token_id] if self.cls_token_id is not None else []
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
         return tuple(files)
diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py
index 025ed99b9aceaf..e7945089c0765d 100644
--- a/tests/models/codegen/test_tokenization_codegen.py
+++ b/tests/models/codegen/test_tokenization_codegen.py
@@ -264,3 +264,55 @@ def test_truncation(self):
     # tokenizer has no padding token
     def test_padding_different_model_input_name(self):
         pass
+
+    @slow
+    def test_tokenizer_integration(self):
+        # Custom test since this tokenizer takes return_token_type_ids as an init argument for backward compatibility.
+
+        sequences = [
+            "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
+            "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
+            "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
+            "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
+            "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
+            "conditioning on both left and right context in all layers.",
+            "The quick brown fox jumps over the lazy dog.",
+        ]
+
+        tokenizer_classes = [self.tokenizer_class]
+        if self.test_rust_tokenizer:
+            tokenizer_classes.append(self.rust_tokenizer_class)
+
+        # Test the default case, i.e. return_token_type_ids=False.
+        for tokenizer_class in tokenizer_classes:
+            tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono")
+
+            encoding = tokenizer(sequences)
+            decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
+
+            # fmt: off
+            expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # noqa: E501
+            # fmt: on
+
+            encoding_data = encoding.data
+            self.assertDictEqual(encoding_data, expected_encoding)
+
+            for expected, decoded in zip(sequences, decoded_sequences):
+                self.assertEqual(expected, decoded)
+
+        # Test the return_token_type_ids=True case.
+        for tokenizer_class in tokenizer_classes:
+            tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono", return_token_type_ids=True)
+
+            encoding = tokenizer(sequences)
+            decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
+
+            # fmt: off
+            expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # noqa: E501
+            # fmt: on
+
+            encoding_data = encoding.data
+            self.assertDictEqual(encoding_data, expected_encoding)
+
+            for expected, decoded in zip(sequences, decoded_sequences):
+                self.assertEqual(expected, decoded)
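For reference, a minimal usage sketch of the new `return_token_type_ids` flag (not part of the patch itself; the checkpoint is the one used in the integration test above):

```python
from transformers import CodeGenTokenizer

# Opting in at init time appends "token_type_ids" to model_input_names,
# so the tokenizer includes them in its encoded output.
tokenizer = CodeGenTokenizer.from_pretrained(
    "Salesforce/codegen-350M-mono", return_token_type_ids=True
)

encoding = tokenizer("def hello_world():")
print(encoding["input_ids"])
print(encoding["token_type_ids"])  # all zeros for a single sequence
```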