Copied from for test files (#26713)
* copied statement for test files

---------

Co-authored-by: ydshieh <[email protected]>
ydshieh and ydshieh authored Oct 11, 2023
1 parent 9f40639 commit 5334796
Showing 14 changed files with 127 additions and 45 deletions.
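A note on the convention this commit applies: each new `# Copied from` comment names the exact source test (module, class, and method) and, optionally, a comma-separated list of `old->new` text replacements to apply to the copied body. The snippet below is a minimal, illustrative sketch of what one of the new BioGpt statements expresses; it is not the repository's actual consistency checker (that tooling, utils/check_copies.py driven by `make fix-copies`, is not part of this diff).

def apply_copied_from_replacements(source: str, spec: str) -> str:
    # Apply a comma-separated "old->new" replacement spec to copied code.
    # Illustrative only: the real checker also handles casing and formatting
    # details that are skipped here.
    for pair in spec.split(","):
        old, new = pair.split("->")
        source = source.replace(old.strip(), new.strip())
    return source

# The spec added below for BioGpt:
spec = "OPT->BioGpt,opt->biogpt,prepare_config_and_inputs->prepare_config_and_inputs_for_common"
print(apply_copied_from_replacements("model = OPTForSequenceClassification(config)", spec))
# -> model = BioGptForSequenceClassification(config)

Each statement sits directly above the copied method, so the check applies per method rather than to the whole test class.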
4 changes: 2 additions & 2 deletions tests/models/biogpt/test_modeling_biogpt.py
@@ -386,7 +386,7 @@ def test_model_from_pretrained(self):
model = BioGptModel.from_pretrained(model_name)
self.assertIsNotNone(model)

# Copied from tests.models.opt.test_modeling_opt.OPTModelTest with OPT->BioGpt, prepare_config_and_inputs-> prepare_config_and_inputs_for_common
# Copied from tests.models.opt.test_modeling_opt.OPTModelTest.test_opt_sequence_classification_model with OPT->BioGpt,opt->biogpt,prepare_config_and_inputs->prepare_config_and_inputs_for_common
def test_biogpt_sequence_classification_model(self):
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.num_labels = 3
@@ -399,7 +399,7 @@ def test_biogpt_sequence_classification_model(self):
result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))

# Copied from tests.models.opt.test_modeling_opt.OPTModelTest with OPT->BioGpt, prepare_config_and_inputs-> prepare_config_and_inputs_for_common
# Copied from tests.models.opt.test_modeling_opt.OPTModelTest.test_opt_sequence_classification_model_for_multi_label with OPT->BioGpt,opt->biogpt,prepare_config_and_inputs->prepare_config_and_inputs_for_common
def test_biogpt_sequence_classification_model_for_multi_label(self):
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.num_labels = 3
7 changes: 4 additions & 3 deletions tests/models/clap/test_feature_extraction_clap.py
@@ -19,6 +19,7 @@
import unittest

import numpy as np
from datasets import load_dataset

from transformers import ClapFeatureExtractor
from transformers.testing_utils import require_torch, require_torchaudio
@@ -110,10 +111,10 @@ def _flatten(list_of_lists):

@require_torch
@require_torchaudio
# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest with Whisper->Clap
class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = ClapFeatureExtractor

# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.setUp with Whisper->Clap
def setUp(self):
self.feat_extract_tester = ClapFeatureExtractionTester(self)

@@ -147,6 +148,7 @@ def test_call(self):
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad
def test_double_precision_pad(self):
import torch

@@ -160,9 +162,8 @@ def test_double_precision_pad(self):
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_features.dtype == torch.float32)

# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples
def _load_datasamples(self, num_samples):
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
2 changes: 1 addition & 1 deletion tests/models/llama/test_modeling_llama.py
@@ -341,7 +341,7 @@ def test_llama_sequence_classification_model_for_multi_label(self):
result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))

@unittest.skip("LLaMA buffers include complex numbers, which breaks this test")
@unittest.skip("Llama buffers include complex numbers, which breaks this test")
def test_save_load_fast_init_from_base(self):
pass

12 changes: 11 additions & 1 deletion tests/models/longformer/test_tokenization_longformer.py
@@ -27,7 +27,6 @@
from ...test_tokenization_common import TokenizerTesterMixin


# Copied from transformers.tests.roberta.test_modeling_roberta.py with Roberta->Longformer
@require_tokenizers
class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = LongformerTokenizer
@@ -72,19 +71,23 @@ def setUp(self):
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_tokenizer
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_rust_tokenizer
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
return input_text, output_text

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_full_tokenizer
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "lower newer"
@@ -96,6 +99,7 @@ def test_full_tokenizer(self):
input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.roberta_dict_integration_testing with roberta->longformer
def longformer_dict_integration_testing(self):
tokenizer = self.get_tokenizer()

@@ -106,6 +110,7 @@ def longformer_dict_integration_testing(self):
)

@slow
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_sequence_builders with roberta-base->allenai/longformer-base-4096
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("allenai/longformer-base-4096")

@@ -125,6 +130,7 @@ def test_sequence_builders(self):
assert encoded_sentence == encoded_text_from_decode
assert encoded_pair == encoded_pair_from_decode

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_space_encoding
def test_space_encoding(self):
tokenizer = self.get_tokenizer()

@@ -165,9 +171,11 @@ def test_space_encoding(self):
first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
self.assertNotEqual(first_char, space_encoding)

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_pretokenized_inputs
def test_pretokenized_inputs(self):
pass

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_embeded_special_tokens
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -200,6 +208,7 @@ def test_embeded_special_tokens(self):
tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_change_add_prefix_space_and_trim_offsets_args
def test_change_add_prefix_space_and_trim_offsets_args(self):
for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
@@ -214,6 +223,7 @@ def test_change_add_prefix_space_and_trim_offsets_args(self):
self.assertEqual(post_processor_state["add_prefix_space"], add_prefix_space)
self.assertEqual(post_processor_state["trim_offsets"], trim_offsets)

# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments
def test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments(self):
# Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` and
# `trim_offsets`
7 changes: 6 additions & 1 deletion tests/models/mistral/test_modeling_mistral.py
@@ -39,7 +39,6 @@
)


# Copied from transformers.tests.mistral.test_modelling_mistral.MistralModelTest with Llama->Mistral
class MistralModelTester:
def __init__(
self,
@@ -93,6 +92,7 @@ def __init__(
self.pad_token_id = pad_token_id
self.scope = scope

# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

@@ -134,6 +134,7 @@ def get_config(self):
pad_token_id=self.pad_token_id,
)

# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Mistral
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -144,6 +145,7 @@ def create_and_check_model(
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Mistral
def create_and_check_model_as_decoder(
self,
config,
@@ -174,6 +176,7 @@ def create_and_check_model_as_decoder(
result = model(input_ids, attention_mask=input_mask)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Mistral
def create_and_check_for_causal_lm(
self,
config,
@@ -192,6 +195,7 @@ def create_and_check_for_causal_lm(
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))

# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Mistral
def create_and_check_decoder_model_past_large_inputs(
self,
config,
@@ -254,6 +258,7 @@ def create_and_check_decoder_model_past_large_inputs(
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
21 changes: 20 additions & 1 deletion tests/models/mobilebert/test_tokenization_mobilebert.py
@@ -32,7 +32,6 @@
from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english


# Copied from transformers.tests.models.bert.test_modeling_bert.py with Bert->MobileBert and pathfix
@require_tokenizers
class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MobileBertTokenizer
@@ -71,18 +70,21 @@ def setUp(self):
for tokenizer_def in self.tokenizers_list
]

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00E9d,running"
output_text = "unwanted, running"
return input_text, output_text

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_full_tokenizer
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file)

tokens = tokenizer.tokenize("UNwant\u00E9d,running")
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_rust_and_python_full_tokenizers
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
@@ -124,11 +126,13 @@ def test_rust_and_python_full_tokenizers(self):
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_chinese
def test_chinese(self):
tokenizer = BasicTokenizer()

self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower
def test_basic_tokenizer_lower(self):
tokenizer = BasicTokenizer(do_lower_case=True)

@@ -137,6 +141,7 @@ def test_basic_tokenizer_lower(self):
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_false
def test_basic_tokenizer_lower_strip_accents_false(self):
tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)

@@ -145,6 +150,7 @@ def test_basic_tokenizer_lower_strip_accents_false(self):
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"])

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_true
def test_basic_tokenizer_lower_strip_accents_true(self):
tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)

@@ -153,6 +159,7 @@ def test_basic_tokenizer_lower_strip_accents_true(self):
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_default
def test_basic_tokenizer_lower_strip_accents_default(self):
tokenizer = BasicTokenizer(do_lower_case=True)

@@ -161,34 +168,39 @@ def test_basic_tokenizer_lower_strip_accents_default(self):
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_no_lower
def test_basic_tokenizer_no_lower(self):
tokenizer = BasicTokenizer(do_lower_case=False)

self.assertListEqual(
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
)

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_no_lower_strip_accents_false
def test_basic_tokenizer_no_lower_strip_accents_false(self):
tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)

self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
)

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_no_lower_strip_accents_true
def test_basic_tokenizer_no_lower_strip_accents_true(self):
tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
)

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_respects_never_split_tokens
def test_basic_tokenizer_respects_never_split_tokens(self):
tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

self.assertListEqual(
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
)

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_wordpiece_tokenizer
def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]

@@ -203,6 +215,7 @@ def test_wordpiece_tokenizer(self):

self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_is_whitespace
def test_is_whitespace(self):
self.assertTrue(_is_whitespace(" "))
self.assertTrue(_is_whitespace("\t"))
@@ -213,6 +226,7 @@ def test_is_whitespace(self):
self.assertFalse(_is_whitespace("A"))
self.assertFalse(_is_whitespace("-"))

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_is_control
def test_is_control(self):
self.assertTrue(_is_control("\u0005"))

@@ -221,6 +235,7 @@ def test_is_control(self):
self.assertFalse(_is_control("\t"))
self.assertFalse(_is_control("\r"))

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_is_punctuation
def test_is_punctuation(self):
self.assertTrue(_is_punctuation("-"))
self.assertTrue(_is_punctuation("$"))
@@ -230,6 +245,7 @@ def test_is_punctuation(self):
self.assertFalse(_is_punctuation("A"))
self.assertFalse(_is_punctuation(" "))

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_clean_text
def test_clean_text(self):
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
@@ -242,6 +258,7 @@ def test_clean_text(self):
)

@slow
# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_sequence_builders with bert-base-uncased->google/mobilebert-uncased
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("google/mobilebert-uncased")

@@ -254,6 +271,7 @@ def test_sequence_builders(self):
assert encoded_sentence == [101] + text + [102]
assert encoded_pair == [101] + text + [102] + text_2 + [102]

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_offsets_with_special_characters
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -306,6 +324,7 @@ def test_offsets_with_special_characters(self):
)
self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])

# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_change_tokenize_chinese_chars
def test_change_tokenize_chinese_chars(self):
list_of_commun_chinese_char = ["的", "人", "有"]
text_with_chinese_char = "".join(list_of_commun_chinese_char)
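Eight of the 14 changed files are not expanded in this view; going by the commit message, they receive the same kind of `# Copied from` statements. For completeness, here is a hedged sketch of how a copy can be verified against its source once the replacements are applied; the function below and its use of `inspect` are illustrative assumptions, not the actual utils/check_copies.py logic.

import inspect

def copy_matches_source(source_fn, copied_fn, spec: str) -> bool:
    # Return True if copied_fn's source equals source_fn's source after the
    # "old->new" replacements from a "# Copied from ... with ..." spec.
    expected = inspect.getsource(source_fn)
    for pair in spec.split(","):
        old, new = pair.split("->")
        expected = expected.replace(old.strip(), new.strip())
    return expected.strip() == inspect.getsource(copied_fn).strip()

Copies that drift out of sync are then reported by the repository's own tooling and rewritten by `make fix-copies`.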