From f36232b628613271ae8ea540dedf7371df2a4acf Mon Sep 17 00:00:00 2001
From: Yoach Lacombe 
Date: Fri, 22 Sep 2023 10:46:58 +0000
Subject: [PATCH 01/12] add audio_utils usage in the FE of SpeechToText

---
 .../feature_extraction_speech_to_text.py      | 45 +++++++++++++++++--
 .../test_feature_extraction_speech_to_text.py | 37 +++++++++++++++
 2 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
index 81f2ea4e99be22..c338f548382bb5 100644
--- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
@@ -25,7 +25,7 @@
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
 from ...utils import PaddingStrategy, TensorType, logging
-
+from ...audio_utils import mel_filter_bank, spectrogram, window_function
 
 logger = logging.get_logger(__name__)
@@ -55,6 +55,8 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
             Whether or not to zero-mean normalize the extracted features.
         normalize_vars (`bool`, *optional*, defaults to `True`):
             Whether or not to unit-variance normalize the extracted features.
+        use_torchaudio (`bool`, *optional*, defaults to `True`):
+            Whether or not to use the torchaudio implementation of mel-filter banks. If `False`, use a numpy port of the torchaudio mel-filter bank implementation.
     """
 
     model_input_names = ["input_features", "attention_mask"]
@@ -68,6 +70,7 @@ def __init__(
         do_ceptral_normalize=True,
         normalize_means=True,
         normalize_vars=True,
+        use_torchaudio=True,
         **kwargs,
     ):
         super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
@@ -76,6 +79,22 @@
         self.normalize_means = normalize_means
         self.normalize_vars = normalize_vars
         self.return_attention_mask = True
+
+        self.use_torchaudio = use_torchaudio
+        if not use_torchaudio:
+            mel_filters = mel_filter_bank(
+                num_frequency_bins=256,
+                num_mel_filters=self.num_mel_bins,
+                min_frequency=20,
+                max_frequency=sampling_rate//2,
+                sampling_rate=sampling_rate,
+                norm=None,
+                mel_scale="kaldi",
+                triangularize_in_mel_space=True,
+            )
+
+            self.mel_filters = np.pad(mel_filters, ((0,1), (0,0)))
+            self.window = window_function(400, "povey", periodic=False)
 
     def _extract_fbank_features(
         self,
@@ -86,9 +105,27 @@
         and hence the waveform should not be normalized before feature extraction. 
""" waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers - waveform = torch.from_numpy(waveform).unsqueeze(0) - features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate) - return features.numpy() + if self.use_torchaudio: + waveform = torch.from_numpy(waveform).unsqueeze(0) + features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate) + features = features.numpy() + else: + waveform = np.squeeze(waveform) + features = spectrogram( + waveform, + self.window, + frame_length=400, + hop_length=160, + fft_length=512, + power=2.0, + center=False, + preemphasis=0.97, + mel_filters=self.mel_filters, + log_mel="log", + mel_floor=1.192092955078125e-07, + remove_dc_offset=True, + ).T + return features @staticmethod def utterance_cmvn( diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 293b33fde80e3a..f52857ba376cd7 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -144,6 +144,37 @@ def test_call(self): for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + def test_call_audio_utils(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False) + # create three inputs of length 800, 1000, and 1200 + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + # Test feature size + input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features + self.assertTrue(input_features.ndim == 3) + self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size) + + # Test not batched input + encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + # Test 2-D numpy arrays are batched. 
+        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
+        np_speech_inputs = np.asarray(speech_inputs)
+        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
     def test_cepstral_mean_and_variance_normalization(self):
         feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
         speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
@@ -279,3 +310,9 @@ def test_integration(self):
         input_features = feature_extractor(input_speech, return_tensors="pt").input_features
         self.assertEquals(input_features.shape, (1, 584, 24))
         self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4))
+
+        # test audio_utils implementation
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False)
+        input_features = feature_extractor(input_speech, return_tensors="pt").input_features
+        self.assertEquals(input_features.shape, (1, 584, 24))
+        self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4))

From fbc40d32a03a961b27347070a829415b63ac61b7 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe 
Date: Fri, 22 Sep 2023 10:51:17 +0000
Subject: [PATCH 02/12] clean unnecessary parameters of AudioSpectrogramTransformer FE

---
 .../feature_extraction_audio_spectrogram_transformer.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
index 786548fd2336e9..feb24d6c6a0fe0 100644
--- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
@@ -96,13 +96,9 @@ def _extract_fbank_features(
         waveform = torch.from_numpy(waveform).unsqueeze(0)
         fbank = ta_kaldi.fbank(
             waveform,
-            htk_compat=True,
             sample_frequency=self.sampling_rate,
-            use_energy=False,
             window_type="hanning",
             num_mel_bins=self.num_mel_bins,
-            dither=0.0,
-            frame_shift=10,
         )
 
         n_frames = fbank.shape[0]

From 5ad48e0f3991b7a395798da89283eee9d688adcd Mon Sep 17 00:00:00 2001
From: Yoach Lacombe 
Date: Fri, 22 Sep 2023 10:58:40 +0000
Subject: [PATCH 03/12] add audio_utils usage in AST

---
 ...xtraction_audio_spectrogram_transformer.py | 53 ++++++++++++++++---
 ...xtraction_audio_spectrogram_transformer.py | 33 ++++++++++++
 2 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
index feb24d6c6a0fe0..0d832d4e16f5ba 100644
--- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
@@ -25,6 +25,7 @@
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
 from ...utils import TensorType, logging
+from ...audio_utils import 
mel_filter_bank, spectrogram, window_function
 
 logger = logging.get_logger(__name__)
@@ -58,6 +59,8 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
             by default.
         return_attention_mask (`bool`, *optional*, defaults to `False`):
             Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
+        use_torchaudio (`bool`, *optional*, defaults to `True`):
+            Whether or not to use the torchaudio implementation of mel-filter banks. If `False`, use a numpy port of the torchaudio mel-filter bank implementation.
     """
 
     model_input_names = ["input_values", "attention_mask"]
@@ -73,6 +76,7 @@ def __init__(
         mean=-4.2677393,
         std=4.5689974,
         return_attention_mask=False,
+        use_torchaudio=True,
         **kwargs,
     ):
         super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
@@ -82,6 +86,22 @@
         self.mean = mean
         self.std = std
         self.return_attention_mask = return_attention_mask
+
+        self.use_torchaudio = use_torchaudio
+        if not use_torchaudio:
+            mel_filters = mel_filter_bank(
+                num_frequency_bins=256,
+                num_mel_filters=self.num_mel_bins,
+                min_frequency=20,
+                max_frequency=sampling_rate//2,
+                sampling_rate=sampling_rate,
+                norm=None,
+                mel_scale="kaldi",
+                triangularize_in_mel_space=True,
+            )
+
+            self.mel_filters = np.pad(mel_filters, ((0,1), (0,0)))
+            self.window = window_function(400, "hann", periodic=False)
 
     def _extract_fbank_features(
         self,
@@ -93,13 +113,32 @@ def _extract_fbank_features(
         and hence the waveform should not be normalized before feature extraction.
         """
         # waveform = waveform * (2**15)  # Kaldi compliance: 16-bit signed integers
-        waveform = torch.from_numpy(waveform).unsqueeze(0)
-        fbank = ta_kaldi.fbank(
-            waveform,
-            sample_frequency=self.sampling_rate,
-            window_type="hanning",
-            num_mel_bins=self.num_mel_bins,
-        )
+        if self.use_torchaudio:
+            waveform = torch.from_numpy(waveform).unsqueeze(0)
+            fbank = ta_kaldi.fbank(
+                waveform,
+                sample_frequency=self.sampling_rate,
+                window_type="hanning",
+                num_mel_bins=self.num_mel_bins,
+            )
+        else:
+            waveform = np.squeeze(waveform)
+            fbank = spectrogram(
+                waveform,
+                self.window,
+                frame_length=400,
+                hop_length=160,
+                fft_length=512,
+                power=2.0,
+                center=False,
+                preemphasis=0.97,
+                mel_filters=self.mel_filters,
+                log_mel="log",
+                mel_floor=1.192092955078125e-07,
+                remove_dc_offset=True,
+            ).T
+
+            fbank = torch.from_numpy(fbank)
 
         n_frames = fbank.shape[0]
         difference = max_length - n_frames
diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
index 69a1bddc825080..34966b73b54d1d 100644
--- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
+++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
@@ -132,6 +132,33 @@ def test_call(self):
         encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values
         for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
             self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+
+    def test_call_audio_utils(self):
+        # Tests that all call wrap to encode_plus and batch_encode_plus
+        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False)
+        # create three inputs of length 800, 1000, and 1200
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        np_speech_inputs = 
[np.asarray(speech_input) for speech_input in speech_inputs] + + # Test not batched input + encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feat_extract(speech_inputs, padding=True, return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs, padding=True, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + # Test 2-D numpy arrays are batched. + speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] + np_speech_inputs = np.asarray(speech_inputs) + encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) @require_torch def test_double_precision_pad(self): @@ -172,3 +199,9 @@ def test_integration(self): input_values = feature_extractor(input_speech, return_tensors="pt").input_values self.assertEquals(input_values.shape, (1, 1024, 128)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) + + # test audio_utils implementation + feature_extractor = ASTFeatureExtractor(use_torchaudio=False) + input_values = feature_extractor(input_speech, return_tensors="pt").input_values + self.assertEquals(input_values.shape, (1, 1024, 128)) + self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) From 73a4c06394c7ef4a68196acf7ee9f075ce1f61b5 Mon Sep 17 00:00:00 2001 From: Yoach Lacombe Date: Fri, 22 Sep 2023 11:05:53 +0000 Subject: [PATCH 04/12] add serialization tests and function to FEs --- ...xtraction_audio_spectrogram_transformer.py | 15 ++++++ .../feature_extraction_speech_to_text.py | 15 ++++++ ...xtraction_audio_spectrogram_transformer.py | 54 ++++++++++++++++++- .../test_feature_extraction_speech_to_text.py | 54 ++++++++++++++++++- 4 files changed, 136 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index 0d832d4e16f5ba..ac915e94823b36 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -17,6 +17,7 @@ """ from typing import List, Optional, Union +import copy import numpy as np import torch @@ -233,3 +234,17 @@ def __call__( padded_inputs = padded_inputs.convert_to_tensors(return_tensors) return padded_inputs + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. 
+ """ + output = copy.deepcopy(self.__dict__) + output["feature_extractor_type"] = self.__class__.__name__ + if "mel_filters" in output: + del output["mel_filters"] + if "window" in output: + del output["window"] + return output \ No newline at end of file diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index c338f548382bb5..db9314bbcdada7 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -17,6 +17,7 @@ """ from typing import List, Optional, Union +import copy import numpy as np import torch @@ -296,3 +297,17 @@ def __call__( padded_inputs = padded_inputs.convert_to_tensors(return_tensors) return padded_inputs + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + output = copy.deepcopy(self.__dict__) + output["feature_extractor_type"] = self.__class__.__name__ + if "mel_filters" in output: + del output["mel_filters"] + if "window" in output: + del output["window"] + return output \ No newline at end of file diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 34966b73b54d1d..ee9f0fc249f86b 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -17,11 +17,14 @@ import itertools import random import unittest +import os +import tempfile +import copy import numpy as np from transformers import ASTFeatureExtractor -from transformers.testing_utils import require_torch, require_torchaudio +from transformers.testing_utils import require_torch, require_torchaudio, check_json_file_has_correct_format from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -205,3 +208,52 @@ def test_integration(self): input_values = feature_extractor(input_speech, return_tensors="pt").input_values self.assertEquals(input_values.shape, (1, 1024, 128)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) + + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertDictEqual(dict_first, dict_second) + + + # test audio_utils implementation + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = 
feat_extract_second.to_dict() + self.assertDictEqual(dict_first, dict_second) + + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertEqual(dict_first, dict_second) + + # test audio_utils implementation + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertEqual(dict_first, dict_second) \ No newline at end of file diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index f52857ba376cd7..3cbfdf20d7d35b 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -17,11 +17,14 @@ import itertools import random import unittest +import os +import tempfile +import copy import numpy as np from transformers import is_speech_available -from transformers.testing_utils import require_torch, require_torchaudio +from transformers.testing_utils import require_torch, require_torchaudio, check_json_file_has_correct_format from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -316,3 +319,52 @@ def test_integration(self): input_features = feature_extractor(input_speech, return_tensors="pt").input_features self.assertEquals(input_features.shape, (1, 584, 24)) self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) + + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertDictEqual(dict_first, dict_second) + + + # test audio_utils implementation + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertDictEqual(dict_first, dict_second) + + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + 
feat_extract_first.to_json_file(json_file_path)
+            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
+
+        dict_first = feat_extract_first.to_dict()
+        dict_second = feat_extract_second.to_dict()
+        self.assertEqual(dict_first, dict_second)
+
+        # test audio_utils implementation
+        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
+            feat_extract_first.to_json_file(json_file_path)
+            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
+
+        dict_first = feat_extract_first.to_dict()
+        dict_second = feat_extract_second.to_dict()
+        self.assertEqual(dict_first, dict_second)
\ No newline at end of file

From f06db425d7d87c15d3f21d02cb67d6c74bd3868e Mon Sep 17 00:00:00 2001
From: Yoach Lacombe 
Date: Fri, 22 Sep 2023 11:07:10 +0000
Subject: [PATCH 05/12] make style

---
 ...xtraction_audio_spectrogram_transformer.py | 20 ++++++++---------
 .../feature_extraction_speech_to_text.py      | 19 ++++++++--------
 ...xtraction_audio_spectrogram_transformer.py | 21 +++++++++---------
 .../test_feature_extraction_speech_to_text.py | 22 ++++++++++---------
 4 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
index ac915e94823b36..4ee918aec21cd6 100644
--- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
@@ -16,17 +16,17 @@
 Feature extractor class for Audio Spectrogram Transformer.
 """
 
-from typing import List, Optional, Union
 import copy
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
 import torchaudio.compliance.kaldi as ta_kaldi
 
+from ...audio_utils import mel_filter_bank, spectrogram, window_function
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
 from ...utils import TensorType, logging
-from ...audio_utils import mel_filter_bank, spectrogram, window_function
 
 logger = logging.get_logger(__name__)
@@ -61,7 +61,8 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
         return_attention_mask (`bool`, *optional*, defaults to `False`):
             Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
         use_torchaudio (`bool`, *optional*, defaults to `True`):
-            Whether or not to use the torchaudio implementation of mel-filter banks. If `False`, use a numpy port of the torchaudio mel-filter bank implementation.
+            Whether or not to use the torchaudio implementation of mel-filter banks. If `False`, use a numpy port of
+            the torchaudio mel-filter bank implementation. 
""" model_input_names = ["input_values", "attention_mask"] @@ -87,21 +88,21 @@ def __init__( self.mean = mean self.std = std self.return_attention_mask = return_attention_mask - + self.use_torchaudio = use_torchaudio if not use_torchaudio: mel_filters = mel_filter_bank( num_frequency_bins=256, num_mel_filters=self.num_mel_bins, min_frequency=20, - max_frequency=sampling_rate//2, + max_frequency=sampling_rate // 2, sampling_rate=sampling_rate, norm=None, mel_scale="kaldi", triangularize_in_mel_space=True, ) - self.mel_filters = np.pad(mel_filters, ((0,1), (0,0))) + self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0))) self.window = window_function(400, "hann", periodic=False) def _extract_fbank_features( @@ -138,7 +139,7 @@ def _extract_fbank_features( mel_floor=1.192092955078125e-07, remove_dc_offset=True, ).T - + fbank = torch.from_numpy(fbank) n_frames = fbank.shape[0] @@ -237,8 +238,7 @@ def __call__( def to_dict(self): """ - Serializes this instance to a Python dictionary. - Returns: + Serializes this instance to a Python dictionary. Returns: `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. """ output = copy.deepcopy(self.__dict__) @@ -247,4 +247,4 @@ def to_dict(self): del output["mel_filters"] if "window" in output: del output["window"] - return output \ No newline at end of file + return output diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index db9314bbcdada7..13447e52ba3fdc 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -16,17 +16,18 @@ Feature extractor class for Speech2Text """ -from typing import List, Optional, Union import copy +from typing import List, Optional, Union import numpy as np import torch import torchaudio.compliance.kaldi as ta_kaldi +from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import PaddingStrategy, TensorType, logging -from ...audio_utils import mel_filter_bank, spectrogram, window_function + logger = logging.get_logger(__name__) @@ -57,7 +58,8 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor): normalize_vars (`bool`, *optional*, defaults to `True`): Whether or not to unit-variance normalize the extracted features. use_torchaudio (`bool`, *optional*, defaults to `True`): - Whether or not to use torchaudio implementation of mel-filter banks. If `False`, use a numpy porting of torchaudio mel-filter banks implementation. + Whether or not to use torchaudio implementation of mel-filter banks. If `False`, use a numpy porting of + torchaudio mel-filter banks implementation. 
""" model_input_names = ["input_features", "attention_mask"] @@ -80,21 +82,21 @@ def __init__( self.normalize_means = normalize_means self.normalize_vars = normalize_vars self.return_attention_mask = True - + self.use_torchaudio = use_torchaudio if not use_torchaudio: mel_filters = mel_filter_bank( num_frequency_bins=256, num_mel_filters=self.num_mel_bins, min_frequency=20, - max_frequency=sampling_rate//2, + max_frequency=sampling_rate // 2, sampling_rate=sampling_rate, norm=None, mel_scale="kaldi", triangularize_in_mel_space=True, ) - self.mel_filters = np.pad(mel_filters, ((0,1), (0,0))) + self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0))) self.window = window_function(400, "povey", periodic=False) def _extract_fbank_features( @@ -300,8 +302,7 @@ def __call__( def to_dict(self): """ - Serializes this instance to a Python dictionary. - Returns: + Serializes this instance to a Python dictionary. Returns: `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. """ output = copy.deepcopy(self.__dict__) @@ -310,4 +311,4 @@ def to_dict(self): del output["mel_filters"] if "window" in output: del output["window"] - return output \ No newline at end of file + return output diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index ee9f0fc249f86b..c49c3ecde32204 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -15,16 +15,15 @@ import itertools -import random -import unittest import os +import random import tempfile -import copy +import unittest import numpy as np from transformers import ASTFeatureExtractor -from transformers.testing_utils import require_torch, require_torchaudio, check_json_file_has_correct_format +from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -135,11 +134,12 @@ def test_call(self): encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - + def test_call_audio_utils(self): # Tests that all call wrap to encode_plus and batch_encode_plus - feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False) + feat_extract = self.feature_extraction_class( + **self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False + ) # create three inputs of length 800, 1000, and 1200 speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] @@ -220,8 +220,7 @@ def test_feat_extract_from_and_save_pretrained(self): dict_first = feat_extract_first.to_dict() dict_second = feat_extract_second.to_dict() self.assertDictEqual(dict_first, dict_second) - - + # test audio_utils implementation feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) @@ -245,7 +244,7 @@ def test_feat_extract_to_json_file(self): dict_first = 
feat_extract_first.to_dict() dict_second = feat_extract_second.to_dict() self.assertEqual(dict_first, dict_second) - + # test audio_utils implementation feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) @@ -256,4 +255,4 @@ def test_feat_extract_to_json_file(self): dict_first = feat_extract_first.to_dict() dict_second = feat_extract_second.to_dict() - self.assertEqual(dict_first, dict_second) \ No newline at end of file + self.assertEqual(dict_first, dict_second) diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 3cbfdf20d7d35b..6bcabdb8c16455 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -15,16 +15,15 @@ import itertools -import random -import unittest import os +import random import tempfile -import copy +import unittest import numpy as np from transformers import is_speech_available -from transformers.testing_utils import require_torch, require_torchaudio, check_json_file_has_correct_format +from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -149,7 +148,9 @@ def test_call(self): def test_call_audio_utils(self): # Tests that all call wrap to encode_plus and batch_encode_plus - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False) + feature_extractor = self.feature_extraction_class( + **self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False + ) # create three inputs of length 800, 1000, and 1200 speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] @@ -315,7 +316,9 @@ def test_integration(self): self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) # test audio_utils implementation - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False) + feature_extractor = self.feature_extraction_class( + **self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False + ) input_features = feature_extractor(input_speech, return_tensors="pt").input_features self.assertEquals(input_features.shape, (1, 584, 24)) self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) @@ -331,8 +334,7 @@ def test_feat_extract_from_and_save_pretrained(self): dict_first = feat_extract_first.to_dict() dict_second = feat_extract_second.to_dict() self.assertDictEqual(dict_first, dict_second) - - + # test audio_utils implementation feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) @@ -356,7 +358,7 @@ def test_feat_extract_to_json_file(self): dict_first = feat_extract_first.to_dict() dict_second = feat_extract_second.to_dict() self.assertEqual(dict_first, dict_second) - + # test audio_utils implementation feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) @@ -367,4 +369,4 @@ def test_feat_extract_to_json_file(self): dict_first = feat_extract_first.to_dict() dict_second = feat_extract_second.to_dict() - self.assertEqual(dict_first, dict_second) \ No newline at end of file + self.assertEqual(dict_first, 
dict_second)

From 608644b49ccee15c397da4f678ef67b570499bc3 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe 
Date: Thu, 28 Sep 2023 16:05:04 +0000
Subject: [PATCH 06/12] remove use_torchaudio and move to_dict to FE

---
 src/transformers/feature_extraction_utils.py  | 11 ++++----
 ...xtraction_audio_spectrogram_transformer.py | 27 ++++---------------
 .../feature_extraction_speech_to_text.py      | 27 ++++---------------
 3 files changed, 16 insertions(+), 49 deletions(-)

diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index 838827f8c5c2f7..646460710e0b81 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -574,14 +574,15 @@ def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> PreTrain
 
     def to_dict(self) -> Dict[str, Any]:
         """
-        Serializes this instance to a Python dictionary.
-
-        Returns:
-            `Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
+        Serializes this instance to a Python dictionary. Returns:
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
         """
         output = copy.deepcopy(self.__dict__)
         output["feature_extractor_type"] = self.__class__.__name__
-
+        if "mel_filters" in output:
+            del output["mel_filters"]
+        if "window" in output:
+            del output["window"]
         return output
 
     @classmethod
diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
index 4ee918aec21cd6..c348a277464244 100644
--- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
@@ -16,7 +16,6 @@
 Feature extractor class for Audio Spectrogram Transformer.
 """
 
-import copy
 from typing import List, Optional, Union
 
 import numpy as np
@@ -26,7 +25,7 @@
 from ...audio_utils import mel_filter_bank, spectrogram, window_function
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, logging
+from ...utils import TensorType, is_speech_available, logging
 
 logger = logging.get_logger(__name__)
@@ -39,8 +38,8 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
     This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
     most of the main methods. Users should refer to this superclass for more information regarding those methods.
 
-    This class extracts mel-filter bank features from raw speech using TorchAudio, pads/truncates them to a fixed
-    length and normalizes them using a mean and standard deviation.
+    This class extracts mel-filter bank features from raw speech using TorchAudio if installed or using numpy
+    otherwise, pads/truncates them to a fixed length and normalizes them using a mean and standard deviation.
 
     Args:
         feature_size (`int`, *optional*, defaults to 1):
@@ -60,9 +59,6 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
             by default.
         return_attention_mask (`bool`, *optional*, defaults to `False`):
             Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
-        use_torchaudio (`bool`, *optional*, defaults to `True`):
-            Whether or not to use the torchaudio implementation of mel-filter banks. 
If `False`, use a numpy port of
-            the torchaudio mel-filter bank implementation.
     """
 
     model_input_names = ["input_values", "attention_mask"]
@@ -90,7 +86,7 @@ def __init__(
         self.return_attention_mask = return_attention_mask
 
         self.use_torchaudio = use_torchaudio
-        if not use_torchaudio:
+        if not is_speech_available():
             mel_filters = mel_filter_bank(
                 num_frequency_bins=256,
                 num_mel_filters=self.num_mel_bins,
@@ -115,7 +111,7 @@ def _extract_fbank_features(
         and hence the waveform should not be normalized before feature extraction.
         """
         # waveform = waveform * (2**15)  # Kaldi compliance: 16-bit signed integers
-        if self.use_torchaudio:
+        if is_speech_available():
             waveform = torch.from_numpy(waveform).unsqueeze(0)
             fbank = ta_kaldi.fbank(
                 waveform,
@@ -235,16 +231,3 @@ def __call__(
         padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
 
         return padded_inputs
-
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Returns:
-            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["feature_extractor_type"] = self.__class__.__name__
-        if "mel_filters" in output:
-            del output["mel_filters"]
-        if "window" in output:
-            del output["window"]
-        return output
diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
index 13447e52ba3fdc..d3f8ef93811417 100644
--- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
@@ -16,7 +16,6 @@
 Feature extractor class for Speech2Text
 """
 
-import copy
 from typing import List, Optional, Union
 
 import numpy as np
@@ -26,7 +25,7 @@
 from ...audio_utils import mel_filter_bank, spectrogram, window_function
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
+from ...utils import PaddingStrategy, TensorType, is_speech_available, logging
 
 logger = logging.get_logger(__name__)
@@ -39,8 +38,8 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
     This feature extractor inherits from [`Speech2TextFeatureExtractor`] which contains most of the main methods. Users
     should refer to this superclass for more information regarding those methods.
 
-    This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral
-    mean and variance normalization to the extracted features.
+    This class extracts mel-filter bank features from raw speech using TorchAudio if installed or using numpy
+    otherwise, and applies utterance-level cepstral mean and variance normalization to the extracted features.
 
     Args:
         feature_size (`int`, defaults to 80):
@@ -57,9 +56,6 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
             Whether or not to zero-mean normalize the extracted features.
         normalize_vars (`bool`, *optional*, defaults to `True`):
             Whether or not to unit-variance normalize the extracted features.
-        use_torchaudio (`bool`, *optional*, defaults to `True`):
-            Whether or not to use the torchaudio implementation of mel-filter banks. If `False`, use a numpy port of
-            the torchaudio mel-filter bank implementation. 
""" model_input_names = ["input_features", "attention_mask"] @@ -84,7 +80,7 @@ def __init__( self.return_attention_mask = True self.use_torchaudio = use_torchaudio - if not use_torchaudio: + if not is_speech_available(): mel_filters = mel_filter_bank( num_frequency_bins=256, num_mel_filters=self.num_mel_bins, @@ -108,7 +104,7 @@ def _extract_fbank_features( and hence the waveform should not be normalized before feature extraction. """ waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers - if self.use_torchaudio: + if is_speech_available(): waveform = torch.from_numpy(waveform).unsqueeze(0) features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate) features = features.numpy() @@ -299,16 +295,3 @@ def __call__( padded_inputs = padded_inputs.convert_to_tensors(return_tensors) return padded_inputs - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. - """ - output = copy.deepcopy(self.__dict__) - output["feature_extractor_type"] = self.__class__.__name__ - if "mel_filters" in output: - del output["mel_filters"] - if "window" in output: - del output["window"] - return output From 605ed144d6a7b0a4283a6bf750ea70e1e9f491f4 Mon Sep 17 00:00:00 2001 From: Yoach Lacombe Date: Fri, 29 Sep 2023 11:03:46 +0000 Subject: [PATCH 07/12] test audio_utils usage --- ...xtraction_audio_spectrogram_transformer.py | 5 +- .../feature_extraction_speech_to_text.py | 2 - ...xtraction_audio_spectrogram_transformer.py | 115 ++++++--- .../test_feature_extraction_speech_to_text.py | 224 ++++++++++++++---- .../test_processor_speech_to_text.py | 7 +- 5 files changed, 268 insertions(+), 85 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index c348a277464244..e1b299cb2e10c1 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -20,13 +20,14 @@ import numpy as np import torch -import torchaudio.compliance.kaldi as ta_kaldi from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, is_speech_available, logging +if is_speech_available(): + import torchaudio.compliance.kaldi as ta_kaldi logger = logging.get_logger(__name__) @@ -74,7 +75,6 @@ def __init__( mean=-4.2677393, std=4.5689974, return_attention_mask=False, - use_torchaudio=True, **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) @@ -85,7 +85,6 @@ def __init__( self.std = std self.return_attention_mask = return_attention_mask - self.use_torchaudio = use_torchaudio if not is_speech_available(): mel_filters = mel_filter_bank( num_frequency_bins=256, diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index d3f8ef93811417..7f8943875d4ee1 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ 
b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -69,7 +69,6 @@ def __init__( do_ceptral_normalize=True, normalize_means=True, normalize_vars=True, - use_torchaudio=True, **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) @@ -79,7 +78,6 @@ def __init__( self.normalize_vars = normalize_vars self.return_attention_mask = True - self.use_torchaudio = use_torchaudio if not is_speech_available(): mel_filters = mel_filter_bank( num_frequency_bins=256, diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index c49c3ecde32204..e81c3614203363 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -134,12 +134,89 @@ def test_call(self): encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + @require_torch + def test_double_precision_pad(self): + import torch - def test_call_audio_utils(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feat_extract = self.feature_extraction_class( - **self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + np_speech_inputs = np.random.rand(100).astype(np.float64) + py_speech_inputs = np_speech_inputs.tolist() + + for inputs in [py_speech_inputs, np_speech_inputs]: + np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np") + self.assertTrue(np_processed.input_values.dtype == np.float32) + pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt") + self.assertTrue(pt_processed.input_values.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + @require_torch + def test_integration(self): + # fmt: off + EXPECTED_INPUT_VALUES = torch.tensor( + [-0.9894, -1.2776, -0.9066, -1.2776, -0.9349, -1.2609, -1.0386, -1.2776, + -1.1561, -1.2776, -1.2052, -1.2723, -1.2190, -1.2132, -1.2776, -1.1133, + -1.1953, -1.1343, -1.1584, -1.2203, -1.1770, -1.2474, -1.2381, -1.1936, + -0.9270, -0.8317, -0.8049, -0.7706, -0.7565, -0.7869] ) + # fmt: on + + input_speech = self._load_datasamples(1) + feature_extractor = ASTFeatureExtractor() + input_values = feature_extractor(input_speech, return_tensors="pt").input_values + self.assertEquals(input_values.shape, (1, 1024, 128)) + self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) + + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + 
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertDictEqual(dict_first, dict_second) + + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertEqual(dict_first, dict_second) + + +@require_torch +@unittest.mock.patch("transformers.models.audio_spectrogram_transformer.feature_extraction_audio_spectrogram_transformer.is_speech_available", lambda: False) +class ASTFeatureExtractionWithoutTorchaudioTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + feature_extraction_class = ASTFeatureExtractor + def setUp(self): + self.feat_extract_tester = ASTFeatureExtractionTester(self) + + def test_using_audio_utils(self): + # Tests that it uses audio_utils instead of torchaudio + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + + self.assertTrue(hasattr(feat_extract, "window")) + self.assertTrue(hasattr(feat_extract, "mel_filters")) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) # create three inputs of length 800, 1000, and 1200 speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] @@ -203,12 +280,6 @@ def test_integration(self): self.assertEquals(input_values.shape, (1, 1024, 128)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) - # test audio_utils implementation - feature_extractor = ASTFeatureExtractor(use_torchaudio=False) - input_values = feature_extractor(input_speech, return_tensors="pt").input_values - self.assertEquals(input_values.shape, (1, 1024, 128)) - self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) - def test_feat_extract_from_and_save_pretrained(self): feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) @@ -221,18 +292,6 @@ def test_feat_extract_from_and_save_pretrained(self): dict_second = feat_extract_second.to_dict() self.assertDictEqual(dict_first, dict_second) - # test audio_utils implementation - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - self.assertDictEqual(dict_first, dict_second) - def test_feat_extract_to_json_file(self): feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) @@ -244,15 +303,3 @@ def test_feat_extract_to_json_file(self): dict_first = feat_extract_first.to_dict() dict_second = feat_extract_second.to_dict() self.assertEqual(dict_first, dict_second) - - # test audio_utils implementation 
- feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "feat_extract.json") - feat_extract_first.to_json_file(json_file_path) - feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - self.assertEqual(dict_first, dict_second) diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 6bcabdb8c16455..e2dd3146c4fb92 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -22,14 +22,12 @@ import numpy as np -from transformers import is_speech_available from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin -if is_speech_available(): - from transformers import Speech2TextFeatureExtractor +from transformers import Speech2TextFeatureExtractor global_rng = random.Random() @@ -106,7 +104,7 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = Speech2TextFeatureExtractor if is_speech_available() else None + feature_extraction_class = Speech2TextFeatureExtractor def setUp(self): self.feat_extract_tester = Speech2TextFeatureExtractionTester(self) @@ -146,11 +144,189 @@ def test_call(self): for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - def test_call_audio_utils(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feature_extractor = self.feature_extraction_class( - **self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False + def test_cepstral_mean_and_variance_normalization(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + + paddings = ["longest", "max_length", "do_not_pad"] + max_lengths = [None, 16, None] + for max_length, padding in zip(max_lengths, paddings): + inputs = feature_extractor( + speech_inputs, padding=padding, max_length=max_length, return_attention_mask=True + ) + input_features = inputs.input_features + attention_mask = inputs.attention_mask + fbank_feat_lengths = [np.sum(x) for x in attention_mask] + + self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]]) + self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]]) + self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]]) + + def test_cepstral_mean_and_variance_normalization_np(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + + paddings = ["longest", "max_length", "do_not_pad"] + max_lengths = [None, 16, None] + for max_length, padding in zip(max_lengths, paddings): + inputs = feature_extractor( + speech_inputs, max_length=max_length, padding=padding, return_tensors="np", return_attention_mask=True + ) + 
input_features = inputs.input_features + attention_mask = inputs.attention_mask + fbank_feat_lengths = [np.sum(x) for x in attention_mask] + + self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]]) + self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6) + self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]]) + self.assertTrue(input_features[0][fbank_feat_lengths[1] :].sum() < 1e-6) + self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]]) + + def test_cepstral_mean_and_variance_normalization_trunc_max_length(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + inputs = feature_extractor( + speech_inputs, + padding="max_length", + max_length=4, + truncation=True, + return_tensors="np", + return_attention_mask=True, + ) + input_features = inputs.input_features + attention_mask = inputs.attention_mask + fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) + + self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) + self._check_zero_mean_unit_variance(input_features[1]) + self._check_zero_mean_unit_variance(input_features[2]) + + def test_cepstral_mean_and_variance_normalization_trunc_longest(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + inputs = feature_extractor( + speech_inputs, + padding="longest", + max_length=4, + truncation=True, + return_tensors="np", + return_attention_mask=True, + ) + input_features = inputs.input_features + attention_mask = inputs.attention_mask + fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) + + self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) + self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]]) + self._check_zero_mean_unit_variance(input_features[2]) + + # make sure that if max_length < longest -> then pad to max_length + self.assertEqual(input_features.shape, (3, 4, 24)) + + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + inputs = feature_extractor( + speech_inputs, + padding="longest", + max_length=16, + truncation=True, + return_tensors="np", + return_attention_mask=True, ) + input_features = inputs.input_features + attention_mask = inputs.attention_mask + fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) + + self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) + self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]]) + self._check_zero_mean_unit_variance(input_features[2]) + + # make sure that if max_length < longest -> then pad to max_length + self.assertEqual(input_features.shape, (3, 6, 24)) + + def test_double_precision_pad(self): + import torch + + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + np_speech_inputs = np.random.rand(100, 32).astype(np.float64) + py_speech_inputs = np_speech_inputs.tolist() + + for inputs in [py_speech_inputs, np_speech_inputs]: + np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") + self.assertTrue(np_processed.input_features.dtype == np.float32) + pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") + 
self.assertTrue(pt_processed.input_features.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def test_integration(self): + # fmt: off + expected = np.array([ + -1.5745, -1.7713, -1.7020, -1.6069, -1.2250, -1.1105, -0.9072, -0.8241, + -1.2310, -0.8098, -0.3320, -0.4101, -0.7985, -0.4996, -0.8213, -0.9128, + -1.0420, -1.1286, -1.0440, -0.7999, -0.8405, -1.2275, -1.5443, -1.4625, + ]) + # fmt: on + + input_speech = self._load_datasamples(1) + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + input_features = feature_extractor(input_speech, return_tensors="pt").input_features + self.assertEquals(input_features.shape, (1, 584, 24)) + self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) + + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertDictEqual(dict_first, dict_second) + + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + self.assertEqual(dict_first, dict_second) + + +@require_torch +@unittest.mock.patch("transformers.models.speech_to_text.feature_extraction_speech_to_text.is_speech_available", lambda: False) +class Speech2TextFeatureExtractionWithoutTorchaudioTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + feature_extraction_class = Speech2TextFeatureExtractor + + def setUp(self): + self.feat_extract_tester = Speech2TextFeatureExtractionTester(self) + + def _check_zero_mean_unit_variance(self, input_vector): + self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3)) + self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3)) + + def test_using_audio_utils(self): + # Tests that it uses audio_utils instead of torchaudio + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + + self.assertTrue(hasattr(feat_extract, "window")) + self.assertTrue(hasattr(feat_extract, "mel_filters")) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) # create three inputs of length 800, 1000, and 1200 speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] @@ -315,14 +491,6 @@ def test_integration(self): self.assertEquals(input_features.shape, (1, 
584, 24)) self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) - # test audio_utils implementation - feature_extractor = self.feature_extraction_class( - **self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False - ) - input_features = feature_extractor(input_speech, return_tensors="pt").input_features - self.assertEquals(input_features.shape, (1, 584, 24)) - self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) - def test_feat_extract_from_and_save_pretrained(self): feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) @@ -335,18 +503,6 @@ def test_feat_extract_from_and_save_pretrained(self): dict_second = feat_extract_second.to_dict() self.assertDictEqual(dict_first, dict_second) - # test audio_utils implementation - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - self.assertDictEqual(dict_first, dict_second) - def test_feat_extract_to_json_file(self): feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) @@ -358,15 +514,3 @@ def test_feat_extract_to_json_file(self): dict_first = feat_extract_first.to_dict() dict_second = feat_extract_second.to_dict() self.assertEqual(dict_first, dict_second) - - # test audio_utils implementation - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "feat_extract.json") - feat_extract_first.to_json_file(json_file_path) - feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - self.assertEqual(dict_first, dict_second) diff --git a/tests/models/speech_to_text/test_processor_speech_to_text.py b/tests/models/speech_to_text/test_processor_speech_to_text.py index 9b8b3ccf66b212..976c7aeff65ab9 100644 --- a/tests/models/speech_to_text/test_processor_speech_to_text.py +++ b/tests/models/speech_to_text/test_processor_speech_to_text.py @@ -18,18 +18,13 @@ from pathlib import Path from shutil import copyfile -from transformers import Speech2TextTokenizer, is_speech_available +from transformers import Speech2TextTokenizer, Speech2TextFeatureExtractor, Speech2TextProcessor from transformers.models.speech_to_text.tokenization_speech_to_text import VOCAB_FILES_NAMES, save_json from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, require_torchaudio from transformers.utils import FEATURE_EXTRACTOR_NAME from .test_feature_extraction_speech_to_text import floats_list - -if is_speech_available(): - from transformers import Speech2TextFeatureExtractor, Speech2TextProcessor - - SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model") From b16b5a9a54aa67890a1aeaa48c098733c928ab31 Mon Sep 17 00:00:00 2001 From: Yoach Lacombe Date: Fri, 29 Sep 2023 11:11:23 +0000 Subject: [PATCH 08/12] make style and fix import (remove torchaudio dependency import) --- src/transformers/__init__.py | 27 +++---------------- .../audio_spectrogram_transformer/__init__.py | 21 +++------------ 
...xtraction_audio_spectrogram_transformer.py | 1 + .../models/speech_to_text/__init__.py | 19 ++----------- .../feature_extraction_speech_to_text.py | 4 ++- ...xtraction_audio_spectrogram_transformer.py | 14 ++++++---- .../test_feature_extraction_speech_to_text.py | 9 ++++--- .../test_processor_speech_to_text.py | 3 ++- 8 files changed, 30 insertions(+), 68 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index cd06fd001f290d..b9f0b663b96af6 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -146,6 +146,7 @@ "models.audio_spectrogram_transformer": [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ASTConfig", + "ASTFeatureExtractor", ], "models.auto": [ "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -514,6 +515,7 @@ "models.speech_to_text": [ "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Speech2TextConfig", + "Speech2TextFeatureExtractor", "Speech2TextProcessor", ], "models.speech_to_text_2": [ @@ -887,20 +889,6 @@ else: _import_structure["convert_slow_tokenizer"] = ["SLOW_TO_FAST_CONVERTERS", "convert_slow_tokenizer"] -# Speech-specific objects -try: - if not is_speech_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils import dummy_speech_objects - - _import_structure["utils.dummy_speech_objects"] = [ - name for name in dir(dummy_speech_objects) if not name.startswith("_") - ] -else: - _import_structure["models.audio_spectrogram_transformer"].append("ASTFeatureExtractor") - _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor") - # Tensorflow-text-specific objects try: if not is_tensorflow_text_available(): @@ -4275,6 +4263,7 @@ from .models.audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ASTConfig, + ASTFeatureExtractor, ) from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4624,6 +4613,7 @@ from .models.speech_to_text import ( SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig, + Speech2TextFeatureExtractor, Speech2TextProcessor, ) from .models.speech_to_text_2 import ( @@ -4964,15 +4954,6 @@ else: from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, convert_slow_tokenizer - try: - if not is_speech_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from .utils.dummy_speech_objects import * - else: - from .models.audio_spectrogram_transformer import ASTFeatureExtractor - from .models.speech_to_text import Speech2TextFeatureExtractor - try: if not is_tensorflow_text_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/audio_spectrogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py index 9aa42423cf5fda..2b48fe07311c1e 100644 --- a/src/transformers/models/audio_spectrogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -13,14 +13,15 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_speech_available, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { "configuration_audio_spectrogram_transformer": [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ASTConfig", - ] + ], + "feature_extraction_audio_spectrogram_transformer": ["ASTFeatureExtractor"], } try: @@ -36,19 +37,13 @@ "ASTPreTrainedModel", ] -try: - if not is_speech_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_audio_spectrogram_transformer"] = ["ASTFeatureExtractor"] if TYPE_CHECKING: from .configuration_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ASTConfig, ) + from .feature_extraction_audio_spectrogram_transformer import ASTFeatureExtractor try: if not is_torch_available(): @@ -63,14 +58,6 @@ ASTPreTrainedModel, ) - try: - if not is_speech_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_audio_spectrogram_transformer import ASTFeatureExtractor - else: import sys diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index e1b299cb2e10c1..c4c9684e5d95f1 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -26,6 +26,7 @@ from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, is_speech_available, logging + if is_speech_available(): import torchaudio.compliance.kaldi as ta_kaldi diff --git a/src/transformers/models/speech_to_text/__init__.py b/src/transformers/models/speech_to_text/__init__.py index 45a91c2b4962ab..3194f99931a4d6 100644 --- a/src/transformers/models/speech_to_text/__init__.py +++ b/src/transformers/models/speech_to_text/__init__.py @@ -17,7 +17,6 @@ OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, - is_speech_available, is_tf_available, is_torch_available, ) @@ -25,6 +24,7 @@ _import_structure = { "configuration_speech_to_text": ["SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Speech2TextConfig"], + "feature_extraction_speech_to_text": ["Speech2TextFeatureExtractor"], "processing_speech_to_text": ["Speech2TextProcessor"], } @@ -36,14 +36,6 @@ else: _import_structure["tokenization_speech_to_text"] = ["Speech2TextTokenizer"] -try: - if not is_speech_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_speech_to_text"] = ["Speech2TextFeatureExtractor"] - try: if not is_tf_available(): raise OptionalDependencyNotAvailable() @@ -73,6 +65,7 @@ if TYPE_CHECKING: from .configuration_speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig + from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor from .processing_speech_to_text import Speech2TextProcessor try: @@ -83,14 +76,6 @@ else: from .tokenization_speech_to_text import Speech2TextTokenizer - try: - if not is_speech_available(): - raise OptionalDependencyNotAvailable() - except 
OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor - try: if not is_tf_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index 7f8943875d4ee1..83e19984949c30 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -20,7 +20,6 @@ import numpy as np import torch -import torchaudio.compliance.kaldi as ta_kaldi from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import SequenceFeatureExtractor @@ -28,6 +27,9 @@ from ...utils import PaddingStrategy, TensorType, is_speech_available, logging +if is_speech_available(): + import torchaudio.compliance.kaldi as ta_kaldi + logger = logging.get_logger(__name__) diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index e81c3614203363..d7d951eb96bea6 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -134,7 +134,7 @@ def test_call(self): encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - + @require_torch def test_double_precision_pad(self): import torch @@ -174,7 +174,7 @@ def test_integration(self): input_values = feature_extractor(input_speech, return_tensors="pt").input_values self.assertEquals(input_values.shape, (1, 1024, 128)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) - + def test_feat_extract_from_and_save_pretrained(self): feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) @@ -201,16 +201,20 @@ def test_feat_extract_to_json_file(self): @require_torch -@unittest.mock.patch("transformers.models.audio_spectrogram_transformer.feature_extraction_audio_spectrogram_transformer.is_speech_available", lambda: False) +@unittest.mock.patch( + "transformers.models.audio_spectrogram_transformer.feature_extraction_audio_spectrogram_transformer.is_speech_available", + lambda: False, +) class ASTFeatureExtractionWithoutTorchaudioTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): feature_extraction_class = ASTFeatureExtractor + def setUp(self): self.feat_extract_tester = ASTFeatureExtractionTester(self) - + def test_using_audio_utils(self): # Tests that it uses audio_utils instead of torchaudio feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - + self.assertTrue(hasattr(feat_extract, "window")) self.assertTrue(hasattr(feat_extract, "mel_filters")) diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index e2dd3146c4fb92..9aa4d0bf75dc01 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -22,13 +22,12 @@ 
import numpy as np +from transformers import Speech2TextFeatureExtractor from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin -from transformers import Speech2TextFeatureExtractor - global_rng = random.Random() @@ -306,7 +305,9 @@ def test_feat_extract_to_json_file(self): @require_torch -@unittest.mock.patch("transformers.models.speech_to_text.feature_extraction_speech_to_text.is_speech_available", lambda: False) +@unittest.mock.patch( + "transformers.models.speech_to_text.feature_extraction_speech_to_text.is_speech_available", lambda: False +) class Speech2TextFeatureExtractionWithoutTorchaudioTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): feature_extraction_class = Speech2TextFeatureExtractor @@ -320,7 +321,7 @@ def _check_zero_mean_unit_variance(self, input_vector): def test_using_audio_utils(self): # Tests that it uses audio_utils instead of torchaudio feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - + self.assertTrue(hasattr(feat_extract, "window")) self.assertTrue(hasattr(feat_extract, "mel_filters")) diff --git a/tests/models/speech_to_text/test_processor_speech_to_text.py b/tests/models/speech_to_text/test_processor_speech_to_text.py index 976c7aeff65ab9..923ba29d1a8777 100644 --- a/tests/models/speech_to_text/test_processor_speech_to_text.py +++ b/tests/models/speech_to_text/test_processor_speech_to_text.py @@ -18,13 +18,14 @@ from pathlib import Path from shutil import copyfile -from transformers import Speech2TextTokenizer, Speech2TextFeatureExtractor, Speech2TextProcessor +from transformers import Speech2TextFeatureExtractor, Speech2TextProcessor, Speech2TextTokenizer from transformers.models.speech_to_text.tokenization_speech_to_text import VOCAB_FILES_NAMES, save_json from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, require_torchaudio from transformers.utils import FEATURE_EXTRACTOR_NAME from .test_feature_extraction_speech_to_text import floats_list + SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model") From 8af23136662152b0e4e76aefa8093c3cd53be29a Mon Sep 17 00:00:00 2001 From: Yoach Lacombe Date: Fri, 29 Sep 2023 11:25:12 +0000 Subject: [PATCH 09/12] fix torch dependency for jax and tensor tests --- .../feature_extraction_audio_spectrogram_transformer.py | 7 +++++-- .../speech_to_text/feature_extraction_speech_to_text.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index c4c9684e5d95f1..cc78a7bab11fcd 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -19,17 +19,20 @@ from typing import List, Optional, Union import numpy as np -import torch from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature -from ...utils import TensorType, is_speech_available, logging +from ...utils import TensorType, is_speech_available, is_torch_available, logging if 
is_speech_available():
     import torchaudio.compliance.kaldi as ta_kaldi
 
+if is_torch_available:
+    import torch
+
+
 logger = logging.get_logger(__name__)
 
 
diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
index 83e19984949c30..43bb8e81cd1746 100644
--- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
@@ -19,7 +19,6 @@
 from typing import List, Optional, Union
 
 import numpy as np
-import torch
 
 from ...audio_utils import mel_filter_bank, spectrogram, window_function
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
@@ -28,6 +27,7 @@
 
 
 if is_speech_available():
+    import torch
     import torchaudio.compliance.kaldi as ta_kaldi
 
 logger = logging.get_logger(__name__)

From 5e984762a84ee04981d6df4c5f1a1e5336c28fe5 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe
Date: Fri, 29 Sep 2023 13:46:13 +0000
Subject: [PATCH 10/12] fix typo

---
 .../feature_extraction_audio_spectrogram_transformer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
index cc78a7bab11fcd..2bd122b4098c36 100644
--- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
@@ -29,7 +29,7 @@
 if is_speech_available():
     import torchaudio.compliance.kaldi as ta_kaldi
 
-if is_torch_available:
+if is_torch_available():
     import torch
 
 

From c18ee1afa4bc2bfa8e3f7d618c289fdd77f28fdb Mon Sep 17 00:00:00 2001
From: Yoach Lacombe
Date: Mon, 6 Nov 2023 14:01:38 +0000
Subject: [PATCH 11/12] clean tests with suggestions

---
 ...xtraction_audio_spectrogram_transformer.py | 98 +--------
 .../test_feature_extraction_speech_to_text.py | 203 +-----------------
 2 files changed, 4 insertions(+), 297 deletions(-)

diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
index 6460c62eb9c7b7..2786a1e8319a1a 100644
--- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
+++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
@@ -201,110 +201,16 @@ def test_feat_extract_to_json_file(self):
         self.assertEqual(dict_first, dict_second)
 
 
+# exact same tests as before, except that we simulate that torchaudio is not available
 @require_torch
 @unittest.mock.patch(
     "transformers.models.audio_spectrogram_transformer.feature_extraction_audio_spectrogram_transformer.is_speech_available",
     lambda: False,
 )
-class ASTFeatureExtractionWithoutTorchaudioTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
-    feature_extraction_class = ASTFeatureExtractor
-
-    def setUp(self):
-        self.feat_extract_tester = ASTFeatureExtractionTester(self)
-
+class ASTFeatureExtractionWithoutTorchaudioTest(ASTFeatureExtractionTest):
     def test_using_audio_utils(self):
         # Tests that it uses audio_utils instead of torchaudio
         feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
 
self.assertTrue(hasattr(feat_extract, "window")) self.assertTrue(hasattr(feat_extract, "mel_filters")) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - # create three inputs of length 800, 1000, and 1200 - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - # Test not batched input - encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values - encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - - # Test batched - encoded_sequences_1 = feat_extract(speech_inputs, padding=True, return_tensors="np").input_values - encoded_sequences_2 = feat_extract(np_speech_inputs, padding=True, return_tensors="np").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. - speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values - encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - @require_torch - def test_double_precision_pad(self): - import torch - - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - np_speech_inputs = np.random.rand(100).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_values.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt") - self.assertTrue(pt_processed.input_values.dtype == torch.float32) - - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - @require_torch - def test_integration(self): - # fmt: off - EXPECTED_INPUT_VALUES = torch.tensor( - [-0.9894, -1.2776, -0.9066, -1.2776, -0.9349, -1.2609, -1.0386, -1.2776, - -1.1561, -1.2776, -1.2052, -1.2723, -1.2190, -1.2132, -1.2776, -1.1133, - -1.1953, -1.1343, -1.1584, -1.2203, -1.1770, -1.2474, -1.2381, -1.1936, - -0.9270, -0.8317, -0.8049, -0.7706, -0.7565, -0.7869] - ) - # fmt: on - - input_speech = self._load_datasamples(1) - feature_extractor = ASTFeatureExtractor() - input_values = feature_extractor(input_speech, return_tensors="pt").input_values - self.assertEquals(input_values.shape, (1, 1024, 128)) - self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) - - def test_feat_extract_from_and_save_pretrained(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - 
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
-            check_json_file_has_correct_format(saved_file)
-            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
-
-        dict_first = feat_extract_first.to_dict()
-        dict_second = feat_extract_second.to_dict()
-        self.assertDictEqual(dict_first, dict_second)
-
-    def test_feat_extract_to_json_file(self):
-        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
-            feat_extract_first.to_json_file(json_file_path)
-            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
-
-        dict_first = feat_extract_first.to_dict()
-        dict_second = feat_extract_second.to_dict()
-        self.assertEqual(dict_first, dict_second)
diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
index 62b4d427a6b5ef..afb0486ab82033 100644
--- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
+++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
@@ -305,214 +305,15 @@ def test_feat_extract_to_json_file(self):
         self.assertEqual(dict_first, dict_second)
 
 
+# exact same tests as before, except that we simulate that torchaudio is not available
 @require_torch
 @unittest.mock.patch(
     "transformers.models.speech_to_text.feature_extraction_speech_to_text.is_speech_available", lambda: False
 )
-class Speech2TextFeatureExtractionWithoutTorchaudioTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
-    feature_extraction_class = Speech2TextFeatureExtractor
-
-    def setUp(self):
-        self.feat_extract_tester = Speech2TextFeatureExtractionTester(self)
-
-    def _check_zero_mean_unit_variance(self, input_vector):
-        self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
-        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))
-
+class Speech2TextFeatureExtractionWithoutTorchaudioTest(Speech2TextFeatureExtractionTest):
     def test_using_audio_utils(self):
         # Tests that it uses audio_utils instead of torchaudio
         feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
 
         self.assertTrue(hasattr(feat_extract, "window"))
         self.assertTrue(hasattr(feat_extract, "mel_filters"))
-
-    def test_call(self):
-        # Tests that all call wrap to encode_plus and batch_encode_plus
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        # create three inputs of length 800, 1000, and 1200
-        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
-
-        # Test feature size
-        input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
-        self.assertTrue(input_features.ndim == 3)
-        self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)
-
-        # Test not batched input
-        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
-        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
-        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
-
-        # Test batched
-        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
-        encoded_sequences_2 = feature_extractor(np_speech_inputs, 
return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. - speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - def test_cepstral_mean_and_variance_normalization(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - - paddings = ["longest", "max_length", "do_not_pad"] - max_lengths = [None, 16, None] - for max_length, padding in zip(max_lengths, paddings): - inputs = feature_extractor( - speech_inputs, padding=padding, max_length=max_length, return_attention_mask=True - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = [np.sum(x) for x in attention_mask] - - self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]]) - self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]]) - self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]]) - - def test_cepstral_mean_and_variance_normalization_np(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - - paddings = ["longest", "max_length", "do_not_pad"] - max_lengths = [None, 16, None] - for max_length, padding in zip(max_lengths, paddings): - inputs = feature_extractor( - speech_inputs, max_length=max_length, padding=padding, return_tensors="np", return_attention_mask=True - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = [np.sum(x) for x in attention_mask] - - self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]]) - self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6) - self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]]) - self.assertTrue(input_features[0][fbank_feat_lengths[1] :].sum() < 1e-6) - self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]]) - - def test_cepstral_mean_and_variance_normalization_trunc_max_length(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - inputs = feature_extractor( - speech_inputs, - padding="max_length", - max_length=4, - truncation=True, - return_tensors="np", - return_attention_mask=True, - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) - - self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) - self._check_zero_mean_unit_variance(input_features[1]) - self._check_zero_mean_unit_variance(input_features[2]) - - def test_cepstral_mean_and_variance_normalization_trunc_longest(self): - feature_extractor = 
self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - inputs = feature_extractor( - speech_inputs, - padding="longest", - max_length=4, - truncation=True, - return_tensors="np", - return_attention_mask=True, - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) - - self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) - self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]]) - self._check_zero_mean_unit_variance(input_features[2]) - - # make sure that if max_length < longest -> then pad to max_length - self.assertEqual(input_features.shape, (3, 4, 24)) - - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - inputs = feature_extractor( - speech_inputs, - padding="longest", - max_length=16, - truncation=True, - return_tensors="np", - return_attention_mask=True, - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) - - self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) - self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]]) - self._check_zero_mean_unit_variance(input_features[2]) - - # make sure that if max_length < longest -> then pad to max_length - self.assertEqual(input_features.shape, (3, 6, 24)) - - def test_double_precision_pad(self): - import torch - - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - np_speech_inputs = np.random.rand(100, 32).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_features.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") - self.assertTrue(pt_processed.input_features.dtype == torch.float32) - - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_integration(self): - # fmt: off - expected = np.array([ - -1.5745, -1.7713, -1.7020, -1.6069, -1.2250, -1.1105, -0.9072, -0.8241, - -1.2310, -0.8098, -0.3320, -0.4101, -0.7985, -0.4996, -0.8213, -0.9128, - -1.0420, -1.1286, -1.0440, -0.7999, -0.8405, -1.2275, -1.5443, -1.4625, - ]) - # fmt: on - - input_speech = self._load_datasamples(1) - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - input_features = feature_extractor(input_speech, return_tensors="pt").input_features - self.assertEquals(input_features.shape, (1, 584, 24)) - self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) - - def test_feat_extract_from_and_save_pretrained(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - feat_extract_second = 
self.feature_extraction_class.from_pretrained(tmpdirname)
-
-        dict_first = feat_extract_first.to_dict()
-        dict_second = feat_extract_second.to_dict()
-        self.assertDictEqual(dict_first, dict_second)
-
-    def test_feat_extract_to_json_file(self):
-        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
-            feat_extract_first.to_json_file(json_file_path)
-            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
-
-        dict_first = feat_extract_first.to_dict()
-        dict_second = feat_extract_second.to_dict()
-        self.assertEqual(dict_first, dict_second)

From 4dd6207a42ecd44e5810cf82e3ea9ff88be397ef Mon Sep 17 00:00:00 2001
From: Yoach Lacombe
Date: Tue, 7 Nov 2023 21:17:54 +0000
Subject: [PATCH 12/12] add lines to test if is_speech_available is False

---
 ...test_feature_extraction_audio_spectrogram_transformer.py | 6 ++++++
 .../test_feature_extraction_speech_to_text.py | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
index 2786a1e8319a1a..ac6cd5eb1fbc80 100644
--- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
+++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
@@ -214,3 +214,9 @@ def test_using_audio_utils(self):
 
         self.assertTrue(hasattr(feat_extract, "window"))
         self.assertTrue(hasattr(feat_extract, "mel_filters"))
+
+        from transformers.models.audio_spectrogram_transformer.feature_extraction_audio_spectrogram_transformer import (
+            is_speech_available,
+        )
+
+        self.assertFalse(is_speech_available())
diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
index afb0486ab82033..f652d09ffca5d0 100644
--- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
+++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
@@ -317,3 +317,7 @@ def test_using_audio_utils(self):
 
         self.assertTrue(hasattr(feat_extract, "window"))
         self.assertTrue(hasattr(feat_extract, "mel_filters"))
+
+        from transformers.models.speech_to_text.feature_extraction_speech_to_text import is_speech_available
+
+        self.assertFalse(is_speech_available())
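
A note on the testing pattern the last two patches converge on: applying unittest.mock.patch as a class decorator wraps every test_* method it finds on the class, including methods inherited from the base test class. That is why ASTFeatureExtractionWithoutTorchaudioTest and Speech2TextFeatureExtractionWithoutTorchaudioTest can drop their copied test bodies and simply subclass the original test classes: the whole inherited suite re-runs with the module-level is_speech_available check forced to return False. The sketch below shows the mechanism in isolation; it is a minimal, self-contained example under that assumption, and every name in it (is_speech_available, extract, the test classes) is defined locally for illustration rather than taken from transformers.

import unittest
from unittest import mock


def is_speech_available():
    # Local stand-in for an optional-dependency check; True by default.
    return True


def extract(values):
    # The availability check happens at call time, so patching the
    # module-level function reroutes every call to the fallback branch.
    if is_speech_available():
        return [v * 2.0 for v in values]  # "fast" backend path
    return [v + v for v in values]  # numerically equivalent fallback path


class ExtractorTest(unittest.TestCase):
    def test_extract(self):
        self.assertEqual(extract([1.0, 2.0]), [2.0, 4.0])


# Applied to a class, mock.patch wraps every test_* method, inherited ones
# included, so the full suite above runs again with the patch active.
@mock.patch(f"{__name__}.is_speech_available", lambda: False)
class ExtractorWithoutDependencyTest(ExtractorTest):
    def test_fallback_is_active(self):
        self.assertFalse(is_speech_available())


if __name__ == "__main__":
    unittest.main()

Because the fallback branch of extract produces the same numbers as the default branch, the inherited test passes on both paths, which mirrors the guarantee these patches rely on: the numpy-based audio_utils features must match the torchaudio ones. Subclassing instead of duplicating the tests also means any test later added to the base class is automatically exercised on the torchaudio-free path.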