Add numpy alternative to FE using torchaudio #26339

Merged · 14 commits · Nov 8, 2023
File: Audio Spectrogram Transformer feature extractor

@@ -16,12 +16,14 @@
Feature extractor class for Audio Spectrogram Transformer.
"""

import copy
from typing import List, Optional, Union

import numpy as np
import torch
import torchaudio.compliance.kaldi as ta_kaldi

from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, logging
@@ -58,6 +60,9 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
            by default.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
        use_torchaudio (`bool`, *optional*, defaults to `True`):
            Whether or not to use the torchaudio implementation of mel-filter banks. If `False`, use a NumPy port
            of the torchaudio mel-filter bank implementation.
    """

    model_input_names = ["input_values", "attention_mask"]
@@ -73,6 +78,7 @@ def __init__(
        mean=-4.2677393,
        std=4.5689974,
        return_attention_mask=False,
        use_torchaudio=True,
        **kwargs,
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
@@ -83,6 +89,22 @@ def __init__(
        self.std = std
        self.return_attention_mask = return_attention_mask

        self.use_torchaudio = use_torchaudio
        if not use_torchaudio:
            mel_filters = mel_filter_bank(
                num_frequency_bins=256,
                num_mel_filters=self.num_mel_bins,
                min_frequency=20,
                max_frequency=sampling_rate // 2,
                sampling_rate=sampling_rate,
                norm=None,
                mel_scale="kaldi",
                triangularize_in_mel_space=True,
            )

            self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
            self.window = window_function(400, "hann", periodic=False)

    def _extract_fbank_features(
        self,
        waveform: np.ndarray,
@@ -93,17 +115,32 @@ def _extract_fbank_features(
        and hence the waveform should not be normalized before feature extraction.
        """
        # waveform = waveform * (2**15)  # Kaldi compliance: 16-bit signed integers
        waveform = torch.from_numpy(waveform).unsqueeze(0)
        fbank = ta_kaldi.fbank(
            waveform,
            htk_compat=True,
            sample_frequency=self.sampling_rate,
            use_energy=False,
            window_type="hanning",
            num_mel_bins=self.num_mel_bins,
            dither=0.0,
            frame_shift=10,
        )
Comment on lines -97 to -106 (Contributor Author):

I also took the opportunity to remove some unnecessary parameters here.

        if self.use_torchaudio:
Contributor:

I don't think there's a need to have the use_torchaudio argument. IMO we can execute the torchaudio code if torchaudio is available (thus maintaining backwards compatibility), and the NumPy code otherwise:

if is_torchaudio_available():
    # run the legacy torchaudio code
else:
    # run the numpy code

Contributor:

I'm also fine with removing the legacy torchaudio code altogether. I know this makes the feature extraction quite a bit slower, but I think it is fine to remove the extra dependencies and bring these models in line with the rest of the audio library.

Personally, I would favour this approach over supporting both methods for feature extraction (torchaudio and numpy). IMO having both methods convolutes the code quite a lot, which is something we want to avoid.

Collaborator (@ArthurZucker, Sep 26, 2023):

Fine with me to remove the previous code; it won't be performance-wise backward compatible 🫠

Contributor:

Let's go with the first option here then? Decorate with is_torchaudio_available?

            waveform = torch.from_numpy(waveform).unsqueeze(0)
            fbank = ta_kaldi.fbank(
                waveform,
                sample_frequency=self.sampling_rate,
                window_type="hanning",
                num_mel_bins=self.num_mel_bins,
            )
        else:
            waveform = np.squeeze(waveform)
            fbank = spectrogram(
                waveform,
                self.window,
                frame_length=400,
                hop_length=160,
                fft_length=512,
                power=2.0,
                center=False,
                preemphasis=0.97,
                mel_filters=self.mel_filters,
                log_mel="log",
                mel_floor=1.192092955078125e-07,
                remove_dc_offset=True,
            ).T

            fbank = torch.from_numpy(fbank)

        n_frames = fbank.shape[0]
        difference = max_length - n_frames
@@ -198,3 +235,16 @@ def __call__(
        padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs

    def to_dict(self):
Contributor:

Is this method strictly necessary? If it is, shouldn't it go in the base FeatureExtractionMixin class, rather than being copied out for every feature extractor?

Reply:

Which method do you mean? to_dict?

Collaborator:

Yes, it's about time we add this to the base feature class!
(It's necessary if we support the numpy part.)

"""
Serializes this instance to a Python dictionary. Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__
if "mel_filters" in output:
del output["mel_filters"]
if "window" in output:
del output["window"]
return output
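
The same to_dict is duplicated in the Speech2Text feature extractor below; a hedged sketch of the reviewers' suggestion to hoist it into the shared base class (names follow the discussion; this is not the merged code) could look like:

import copy


class FeatureExtractionMixin:
    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        output["feature_extractor_type"] = self.__class__.__name__
        # Drop numpy arrays built in __init__: they are derived state and
        # are not JSON-serializable.
        output.pop("mel_filters", None)
        output.pop("window", None)
        return output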
File: Speech2Text feature extractor

@@ -16,12 +16,14 @@
Feature extractor class for Speech2Text
"""

import copy
from typing import List, Optional, Union

import numpy as np
import torch
import torchaudio.compliance.kaldi as ta_kaldi

from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging
@@ -55,6 +57,9 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
            Whether or not to zero-mean normalize the extracted features.
        normalize_vars (`bool`, *optional*, defaults to `True`):
            Whether or not to unit-variance normalize the extracted features.
        use_torchaudio (`bool`, *optional*, defaults to `True`):
            Whether or not to use the torchaudio implementation of mel-filter banks. If `False`, use a NumPy port
            of the torchaudio mel-filter bank implementation.
    """

    model_input_names = ["input_features", "attention_mask"]
@@ -68,6 +73,7 @@ def __init__(
        do_ceptral_normalize=True,
        normalize_means=True,
        normalize_vars=True,
        use_torchaudio=True,
        **kwargs,
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
@@ -77,6 +83,22 @@ def __init__(
        self.normalize_vars = normalize_vars
        self.return_attention_mask = True

        self.use_torchaudio = use_torchaudio
        if not use_torchaudio:
            mel_filters = mel_filter_bank(
                num_frequency_bins=256,
                num_mel_filters=self.num_mel_bins,
                min_frequency=20,
                max_frequency=sampling_rate // 2,
                sampling_rate=sampling_rate,
                norm=None,
                mel_scale="kaldi",
                triangularize_in_mel_space=True,
            )

            self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
            self.window = window_function(400, "povey", periodic=False)

    def _extract_fbank_features(
        self,
        waveform: np.ndarray,
@@ -86,9 +108,27 @@ def _extract_fbank_features(
        and hence the waveform should not be normalized before feature extraction.
        """
        waveform = waveform * (2**15)  # Kaldi compliance: 16-bit signed integers
        waveform = torch.from_numpy(waveform).unsqueeze(0)
        features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate)
        return features.numpy()
        if self.use_torchaudio:
            waveform = torch.from_numpy(waveform).unsqueeze(0)
            features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate)
            features = features.numpy()
        else:
            waveform = np.squeeze(waveform)
            features = spectrogram(
                waveform,
                self.window,
                frame_length=400,
                hop_length=160,
                fft_length=512,
                power=2.0,
                center=False,
                preemphasis=0.97,
                mel_filters=self.mel_filters,
                log_mel="log",
                mel_floor=1.192092955078125e-07,
                remove_dc_offset=True,
            ).T
        return features

    @staticmethod
    def utterance_cmvn(
@@ -259,3 +299,16 @@ def __call__(
        padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["feature_extractor_type"] = self.__class__.__name__
        if "mel_filters" in output:
            del output["mel_filters"]
        if "window" in output:
            del output["window"]
        return output
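
For reference, a quick usage sketch (not part of the diff) comparing the two code paths on both extractors; the 16 kHz input and the atol value are assumptions in line with the tolerances the tests below use:

import numpy as np

from transformers import ASTFeatureExtractor, Speech2TextFeatureExtractor

audio = np.random.randn(16000).astype(np.float32)  # one second of synthetic 16 kHz audio

for cls, field in ((ASTFeatureExtractor, "input_values"), (Speech2TextFeatureExtractor, "input_features")):
    torchaudio_fe = cls()                 # default: torchaudio implementation
    numpy_fe = cls(use_torchaudio=False)  # NumPy port added in this PR
    a = torchaudio_fe(audio, sampling_rate=16000, return_tensors="np")[field]
    b = numpy_fe(audio, sampling_rate=16000, return_tensors="np")[field]
    print(cls.__name__, np.allclose(a, b, atol=1e-4))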
File: Audio Spectrogram Transformer feature extraction tests

@@ -15,13 +15,15 @@


import itertools
import os
import random
import tempfile
import unittest

import numpy as np

from transformers import ASTFeatureExtractor
from transformers.testing_utils import require_torch, require_torchaudio
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio
from transformers.utils.import_utils import is_torch_available

from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
@@ -133,6 +135,34 @@ def test_call(self):
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_call_audio_utils(self):
        # Same as test_call, but exercising the numpy (audio_utils) implementation
        feat_extract = self.feature_extraction_class(
            **self.feat_extract_tester.prepare_feat_extract_dict(), use_torchaudio=False
        )
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test not batched input
        encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feat_extract(speech_inputs, padding=True, return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs, padding=True, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

        # Test 2-D numpy arrays are batched
        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
        np_speech_inputs = np.asarray(speech_inputs)
        encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    @require_torch
    def test_double_precision_pad(self):
        import torch
@@ -172,3 +202,57 @@ def test_integration(self):
        input_values = feature_extractor(input_speech, return_tensors="pt").input_values
        self.assertEqual(input_values.shape, (1, 1024, 128))
        self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))

        # test audio_utils implementation
        feature_extractor = ASTFeatureExtractor(use_torchaudio=False)
        input_values = feature_extractor(input_speech, return_tensors="pt").input_values
        self.assertEqual(input_values.shape, (1, 1024, 128))
        self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))

    def test_feat_extract_from_and_save_pretrained(self):
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
            check_json_file_has_correct_format(saved_file)
            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)

        dict_first = feat_extract_first.to_dict()
        dict_second = feat_extract_second.to_dict()
        self.assertDictEqual(dict_first, dict_second)

        # test audio_utils implementation
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False)

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
            check_json_file_has_correct_format(saved_file)
            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)

        dict_first = feat_extract_first.to_dict()
        dict_second = feat_extract_second.to_dict()
        self.assertDictEqual(dict_first, dict_second)

    def test_feat_extract_to_json_file(self):
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
            feat_extract_first.to_json_file(json_file_path)
            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)

        dict_first = feat_extract_first.to_dict()
        dict_second = feat_extract_second.to_dict()
        self.assertEqual(dict_first, dict_second)

        # test audio_utils implementation
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict, use_torchaudio=False)

        with tempfile.TemporaryDirectory() as tmpdirname:
            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
            feat_extract_first.to_json_file(json_file_path)
            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)

        dict_first = feat_extract_first.to_dict()
        dict_second = feat_extract_second.to_dict()
        self.assertEqual(dict_first, dict_second)
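
A short standalone sketch (hypothetical, not part of the diff) of why to_dict must drop the numpy attributes: without it, the JSON round-trips exercised by the two tests above would fail on the numpy path:

import json

from transformers import ASTFeatureExtractor

fe = ASTFeatureExtractor(use_torchaudio=False)
d = fe.to_dict()
# mel_filters and window are numpy arrays built in __init__; to_dict excludes
# them so that the remaining attributes stay JSON-serializable.
assert "mel_filters" not in d and "window" not in d
json.dumps(d)  # would raise TypeError if the arrays were still present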