diff --git a/espnet2/train/preprocessor.py b/espnet2/train/preprocessor.py index 54797e898eb..7f5da2cff19 100644 --- a/espnet2/train/preprocessor.py +++ b/espnet2/train/preprocessor.py @@ -417,7 +417,8 @@ def _speech_process( if self.speech_volume_normalize is not None: speech = data[self.speech_name] ma = np.max(np.abs(speech)) - data[self.speech_name] = speech * self.speech_volume_normalize / ma + if ma != 0: + data[self.speech_name] = speech * self.speech_volume_normalize / ma return data def _text_process( @@ -1420,7 +1421,10 @@ def _speech_process( # use a fixed scale to make it deterministic volume_scale = self.volume_low ma = np.max(np.abs(data[self.speech_name])) - self._apply_to_all_signals(data, lambda x: x * volume_scale / ma, num_spk) + if ma != 0: + self._apply_to_all_signals( + data, lambda x: x * volume_scale / ma, num_spk + ) if self.categories and "category" in data: category = data.pop("category") @@ -1531,7 +1535,10 @@ def __call__( if self.singing_volume_normalize is not None: singing = data[self.singing_name] ma = np.max(np.abs(singing)) - data[self.singing_name] = singing * self.singing_volume_normalize / ma + if ma != 0: + data[self.singing_name] = ( + singing * self.singing_volume_normalize / ma + ) if self.midi_name in data and self.label_name in data: # Load label info