From 00b72f279610aefc75158bbe3ee692ec01ac71b2 Mon Sep 17 00:00:00 2001 From: Rodrigo Loyola Date: Mon, 4 Dec 2023 16:11:15 +0100 Subject: [PATCH 1/4] Update sklearn to scikit-learn in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8fbc0e1..12aeb07 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ 'soundfile', 'scipy', 'librosa', - 'sklearn', + 'scikit-learn', 'pyloudnorm', 'six' ], From 1540684a3690b1e110e3075caad7f68ec58d1074 Mon Sep 17 00:00:00 2001 From: Rodrigo Loyola Date: Tue, 5 Dec 2023 14:29:25 +0100 Subject: [PATCH 2/4] Changed the librosa call in timbral_util.py by adding y and sr --- timbral_models/timbral_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timbral_models/timbral_util.py b/timbral_models/timbral_util.py index 30a209e..01c7255 100644 --- a/timbral_models/timbral_util.py +++ b/timbral_models/timbral_util.py @@ -639,7 +639,7 @@ def calculate_onsets(audio_samples, envelope_samples, fs, look_back_time=20, hys value of [0] is also possible during normal opperation. """ # get onsets with librosa estimation - onsets = librosa.onset.onset_detect(audio_samples, fs, backtrack=True, units='samples') + onsets = librosa.onset.onset_detect(y=audio_samples, sr=fs, backtrack=True, units='samples') # set values for return_loop method time_thresh = int(look_back_time * 0.001 * fs) # 10 ms default look-back time, in samples From 4cb3b144ddae641058652d3fc27301a22f6056af Mon Sep 17 00:00:00 2001 From: Rodrigo Loyola Date: Tue, 5 Dec 2023 15:07:12 +0100 Subject: [PATCH 3/4] Changed the librosa onset call by adding y= and sr= --- build/lib/timbral_models/Timbral_Booming.py | 156 ++ .../lib/timbral_models/Timbral_Brightness.py | 186 ++ build/lib/timbral_models/Timbral_Depth.py | 289 +++ build/lib/timbral_models/Timbral_Extractor.py | 140 ++ build/lib/timbral_models/Timbral_Hardness.py | 274 +++ build/lib/timbral_models/Timbral_Reverb.py | 370 ++++ build/lib/timbral_models/Timbral_Roughness.py | 185 ++ build/lib/timbral_models/Timbral_Sharpness.py | 120 ++ build/lib/timbral_models/Timbral_Warmth.py | 263 +++ build/lib/timbral_models/__init__.py | 10 + build/lib/timbral_models/timbral_util.py | 1816 +++++++++++++++++ timbral_models.egg-info/PKG-INFO | 158 +- timbral_models.egg-info/SOURCES.txt | 3 + timbral_models.egg-info/requires.txt | 4 +- timbral_models/timbral_util.py | 2 +- 15 files changed, 3907 insertions(+), 69 deletions(-) create mode 100644 build/lib/timbral_models/Timbral_Booming.py create mode 100644 build/lib/timbral_models/Timbral_Brightness.py create mode 100644 build/lib/timbral_models/Timbral_Depth.py create mode 100644 build/lib/timbral_models/Timbral_Extractor.py create mode 100644 build/lib/timbral_models/Timbral_Hardness.py create mode 100644 build/lib/timbral_models/Timbral_Reverb.py create mode 100644 build/lib/timbral_models/Timbral_Roughness.py create mode 100644 build/lib/timbral_models/Timbral_Sharpness.py create mode 100644 build/lib/timbral_models/Timbral_Warmth.py create mode 100644 build/lib/timbral_models/__init__.py create mode 100644 build/lib/timbral_models/timbral_util.py diff --git a/build/lib/timbral_models/Timbral_Booming.py b/build/lib/timbral_models/Timbral_Booming.py new file mode 100644 index 0000000..5786c61 --- /dev/null +++ b/build/lib/timbral_models/Timbral_Booming.py @@ -0,0 +1,156 @@ +from __future__ import division +import numpy as np +import soundfile as sf +from . import timbral_util + + +def boominess_calculate(loudspec): + """ + Calculates the Booming Index as described by Hatano, S., and Hashimoto, T. "Booming index as a measure for + evaluating booming sensation", The 29th International congress and Exhibition on Noise Control Engineering, 2000. + """ + + # loudspec from the loudness_1991 code results in values from 0.1 to 24 Bark in 0.1 steps + z = np.arange(0.1, 24.05, 0.1) #0.1 to 24 bark in 0.1 steps + f = 600 * np.sinh(z / 6.0) # convert these bark values to frequency + FR = [25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600, 2000, 2500, + 3150, 4000, 5000, 6300, 8000, 10000, 12500] # get the centre frequencies of 3rd octave bands + + # now I need to convert f onto the FR scale + logFR = np.log10(FR) + FR_step = logFR[1] - logFR[0] # get the step size on the log scale + FR_min = logFR[0] # get the minimum value of the logFR + + logf = np.log10(f) # get the log version of estimated frequencies + # estimate the indexes of the bark scale on the 3rd octave scale + estimated_index = ((logf - FR_min) / float(FR_step)) + 1 + + # weighting function based from the estimated indexes + Weighting_function = 2.13 * np.exp(-0.151 * estimated_index) + + # change the LF indexes to roll off + Weighting_function[0] = 0.8 # this value is estimated + Weighting_function[1] = 1.05 + Weighting_function[2] = 1.10 + Weighting_function[3] = 1.18 + + # identify index where frequency is less than 280Hz + below_280_idx = np.where(f >= 280)[0][0] + + I = loudspec * Weighting_function + loudness = np.sum(loudspec) + Ll = np.sum(loudspec[:below_280_idx]) + + Bandsum = timbral_util.log_sum(I) + BoomingIndex = Bandsum * (Ll / loudness) + + return BoomingIndex + + +def timbral_booming(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False): + """ + This is an implementation of the hasimoto booming index feature. + There are a few fudge factors with the code to convert between the internal representation of the sound using the + same loudness calculation as the sharpness code. The equation for calculating the booming index is not + specifically quoted anywhere so I've done the best i can with the code that was presented. + + Shin, SH, Ih, JG, Hashimoto, T., and Hatano, S.: "Sound quality evaluation of the booming sensation for passenger + cars", Applied Acoustics, Vol. 70, 2009. + + Hatano, S., and Hashimoto, T. "Booming index as a measure for + evaluating booming sensation", The 29th International congress and Exhibition on Noise Control Engineering, 2000. + + This function calculates the apparent Boominess of an audio file. + + This version of timbral_booming contains self loudness normalising methods and can accept arrays as an input + instead of a string filename. + + Version 0.4 + + Required parameter + :param fname: string or numpy array + string, audio filename to be analysed, including full file path and extension. + numpy array, array of audio samples, requires fs to be set to the sample rate. + + Optional parameters + :param fs: int/float, when fname is a numpy array, this is a required to be the sample rate. + Defaults to 0. + :param dev_output: bool, when False return the warmth, when True return all extracted features. + Defaults to False. + :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum. + Defaults to False. + :param clip_output: bool, force the output to be between 0 and 100. + + :return float, apparent boominess of the audio file. + + Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + ''' + Read input + ''' + audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction) + + + # window the audio file into 4096 sample sections + windowed_audio = timbral_util.window_audio(audio_samples, window_length=4096) + + windowed_booming = [] + windowed_rms = [] + for i in range(windowed_audio.shape[0]): + samples = windowed_audio[i, :] # the current time window + # get the rms value and append to list + windowed_rms.append(np.sqrt(np.mean(samples * samples))) + + # calculate the specific loudness + N_entire, N_single = timbral_util.specific_loudness(samples, Pref=100.0, fs=fs, Mod=0) + + # calculate the booming index is contains a level + if N_entire > 0: + # boom = boominess_calculate(N_single) + BoomingIndex = boominess_calculate(N_single) + else: + BoomingIndex = 0 + + windowed_booming.append(BoomingIndex) + + # get level of low frequencies + ll, w_ll = timbral_util.weighted_bark_level(audio_samples, fs, 0, 70) + + ll = np.log10(ll) + # convert to numpy arrays for fancy indexing + windowed_booming = np.array(windowed_booming) + windowed_rms = np.array(windowed_rms) + + # get the weighted average + rms_boom = np.average(windowed_booming, weights=(windowed_rms * windowed_rms)) + rms_boom = np.log10(rms_boom) + + if dev_output: + return [rms_boom, ll] + else: + + # perform thye linear regression + all_metrics = np.ones(3) + all_metrics[0] = rms_boom + all_metrics[1] = ll + + coefficients = np.array([43.67402696195865, -10.90054738389845, 26.836530575185435]) + + boominess = np.sum(all_metrics * coefficients) + + if clip_output: + boominess = timbral_util.output_clip(boominess) + + return boominess diff --git a/build/lib/timbral_models/Timbral_Brightness.py b/build/lib/timbral_models/Timbral_Brightness.py new file mode 100644 index 0000000..7d6a945 --- /dev/null +++ b/build/lib/timbral_models/Timbral_Brightness.py @@ -0,0 +1,186 @@ +from __future__ import division +import numpy as np +import soundfile as sf +from . import timbral_util +from scipy.signal import spectrogram + + +def timbral_brightness(fname, fs=0, dev_output=False, clip_output=False, phase_correction=False, threshold=0, + ratio_crossover=2000, centroid_crossover=100, stepSize=1024, blockSize=2048, minFreq=20): + """ + This function calculates the apparent Brightness of an audio file. + This version of timbral_brightness contains self loudness normalising methods and can accept arrays as an input + instead of a string filename. + + Version 0.4 + + Required parameter + :param fname: string or numpy array + string, audio filename to be analysed, including full file path and extension. + numpy array, array of audio samples, requires fs to be set to the sample rate. + + Optional parameters + :param fs: int/float, when fname is a numpy array, this is a required to be the sample rate. + Defaults to 0. + :param dev_output: bool, when False return the brightness, when True return all extracted features. + :param clip_output: bool, force the output to be between 0 and 100. + :param phase_correction: bool, Perform phase checking before summing to mono. + :param threshold: Threshold below which to ignore the energy in a time window, default to 0. + :param ratio_crossover: Crossover frequency for calculating the HF energy ratio, default to 2000 Hz. + :param centroid_crossover: Highpass frequency for calculating the spectral centroid, default to 100 Hz. + :param stepSize: Step size for calculating spectrogram, default to 1024. + :param blockSize: Block size (fft length) for calculating spectrogram, default to 2048. + :param minFreq: Frequency for high-pass filtering audio prior to all analysis, default to 20 Hz. + + :return: Apparent brightness of audio file, float. + + Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + ''' + Read input + ''' + audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction) + + ''' + Filter audio + ''' + # highpass audio at minimum frequency + audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs) + audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs) + audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=minFreq, fs=fs) + + # get highpass audio at ratio crossover + ratio_highpass_audio = timbral_util.filter_audio_highpass(audio_samples, ratio_crossover, fs) + ratio_highpass_audio = timbral_util.filter_audio_highpass(ratio_highpass_audio, ratio_crossover, fs) + ratio_highpass_audio = timbral_util.filter_audio_highpass(ratio_highpass_audio, ratio_crossover, fs) + + # get highpass audio at centroid crossover + centroid_highpass_audio = timbral_util.filter_audio_highpass(audio_samples, centroid_crossover, fs) + centroid_highpass_audio = timbral_util.filter_audio_highpass(centroid_highpass_audio, centroid_crossover, fs) + centroid_highpass_audio = timbral_util.filter_audio_highpass(centroid_highpass_audio, centroid_crossover, fs) + + ''' + Get spectrograms + ''' + # normalise audio to the maximum value in the unfiltered audio + ratio_highpass_audio *= (1.0 / max(abs(audio_samples))) + centroid_highpass_audio *= (1.0 / max(abs(audio_samples))) + audio_samples *= (1.0 / max(abs(audio_samples))) + + + # set FFT parameters + nfft = blockSize + hop_size = int(3 * nfft / 4) + + # check that audio is long enough to generate spectrograms + if len(audio_samples) >= nfft: + # get spectrogram + ratio_all_freq, ratio_all_time, ratio_all_spec = spectrogram(audio_samples, fs, 'hamming', nfft, + hop_size, nfft, 'constant', True, 'spectrum') + ratio_hp_freq, ratio_hp_time, ratio_hp_spec = spectrogram(ratio_highpass_audio, fs, 'hamming', nfft, + hop_size, nfft, 'constant', True, 'spectrum') + centroid_hp_freq, centroid_hp_time, centroid_hp_spec = spectrogram(centroid_highpass_audio, fs, 'hamming', nfft, + hop_size, nfft, 'constant', True, 'spectrum') + else: + ratio_all_freq, ratio_all_time, ratio_all_spec = spectrogram(audio_samples, fs, 'hamming', + len(audio_samples), + len(audio_samples)-1, + nfft, 'constant', True, 'spectrum') + ratio_hp_freq, ratio_hp_time, ratio_hp_spec = spectrogram(ratio_highpass_audio, fs, 'hamming', + len(ratio_highpass_audio), + len(ratio_highpass_audio)-1, + nfft, 'constant', True, 'spectrum') + centroid_hp_freq, centroid_hp_time, centroid_hp_spec = spectrogram(centroid_highpass_audio, fs, 'hamming', + len(centroid_highpass_audio), + len(centroid_highpass_audio)-1, + nfft, 'constant', True, 'spectrum') + + # initialise variables for storing data + all_ratio = [] + all_hp_centroid = [] + all_tpower = [] + all_hp_centroid_tpower = [] + + # set threshold level at zero + threshold_db = threshold + if threshold_db == 0: + threshold = 0 + hp_threshold = 0 + else: + max_power = max(np.sum(ratio_all_spec, axis=1)) + threshold = max_power * timbral_util.db2mag(threshold_db) + # get the threshold for centroid + # centroid_hp_max_power = max(np.sum(centroid_hp_spec, axis=1)) + # hp_min_power = min(np.sum(hp_spec, axis=1)) + # hp_threshold = hp_max_power * timbral_util.db2mag(threshold_db) + # threshold = 0.0 + + ''' + Calculate features for each time window + ''' + for idx in range(len(ratio_hp_time)): # + # get the current spectrum for this time window + current_ratio_hp_spec = ratio_hp_spec[:, idx] + current_ratio_all_spec = ratio_all_spec[:, idx] + current_centroid_hp_spec = centroid_hp_spec[:, idx] + + # get the power within each spectrum + tpower = np.sum(current_ratio_all_spec) + hp_tpower = np.sum(current_ratio_hp_spec) + # check there is energy in the time window before calculating the ratio (greater than 0) + if tpower > threshold: + # get the ratio + all_ratio.append(hp_tpower / tpower) + # store the powef for weighting + all_tpower.append(tpower) + + # get the tpower to assure greater than zero + hp_centroid_tpower = np.sum(current_centroid_hp_spec) + if hp_centroid_tpower > 0.0: + # get the centroid + all_hp_centroid.append(np.sum(current_centroid_hp_spec * centroid_hp_freq[:len(current_centroid_hp_spec)]) / + np.sum(current_centroid_hp_spec)) + # store the tpower for weighting + all_hp_centroid_tpower.append(hp_centroid_tpower) + + ''' + Get mean and weighted average values + ''' + mean_ratio = np.mean(all_ratio) + mean_hp_centroid = np.mean(all_hp_centroid) + + weighted_mean_ratio = np.average(all_ratio, weights=all_tpower) + weighted_mean_hp_centroid = np.average(all_hp_centroid, weights=all_hp_centroid_tpower) + + if dev_output: + # return the ratio and centroid + return np.log10(weighted_mean_ratio), np.log10(weighted_mean_hp_centroid) + else: + # perform thye linear regression + all_metrics = np.ones(3) + all_metrics[0] = np.log10(weighted_mean_ratio) + all_metrics[1] = np.log10(weighted_mean_hp_centroid) + # all_metrics[2] = np.log10(weighted_mean_ratio) * np.log10(weighted_mean_hp_centroid) + + + coefficients = np.array([4.613128018020465, 17.378889309312974, 17.434733750553022]) + + # coefficients = np.array([-2.9197705625030235, 9.048261758526614, 3.940747859061009, 47.989783427908705]) + bright = np.sum(all_metrics * coefficients) + + if clip_output: + bright = timbral_util.output_clip(bright) + + return bright diff --git a/build/lib/timbral_models/Timbral_Depth.py b/build/lib/timbral_models/Timbral_Depth.py new file mode 100644 index 0000000..6c12496 --- /dev/null +++ b/build/lib/timbral_models/Timbral_Depth.py @@ -0,0 +1,289 @@ +from __future__ import division +import numpy as np +import soundfile as sf +from scipy.signal import spectrogram +import scipy.stats +from . import timbral_util + + +def timbral_depth(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False, threshold_db=-60, + low_frequency_limit=20, centroid_crossover_frequency=2000, ratio_crossover_frequency=500, + db_decay_threshold=-40): + """ + This function calculates the apparent Depth of an audio file. + This version of timbral_depth contains self loudness normalising methods and can accept arrays as an input + instead of a string filename. + + Version 0.4 + + Required parameter + :param fname: string or numpy array + string, audio filename to be analysed, including full file path and extension. + numpy array, array of audio samples, requires fs to be set to the sample rate. + + Optional parameters + :param fs: int/float, when fname is a numpy array, this is a required to be the sample rate. + Defaults to 0. + :param phase_correction: bool, perform phase checking before summing to mono. Defaults to False. + :param dev_output: bool, when False return the depth, when True return all extracted + features. Default to False. + :param threshold_db: float/int (negative), threshold, in dB, for calculating centroids. + Should be negative. Defaults to -60. + :param low_frequency_limit: float/int, low frequency limit at which to highpass filter the audio, in Hz. + Defaults to 20. + :param centroid_crossover_frequency: float/int, crossover frequency for calculating the spectral centroid, in Hz. + Defaults to 2000 + :param ratio_crossover_frequency: float/int, crossover frequency for calculating the ratio, in Hz. + Defaults to 500. + + :param db_decay_threshold: float/int (negative), threshold, in dB, for estimating duration. Should be + negative. Defaults to -40. + + :return: float, aparent depth of audio file, float. + + Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + ''' + Read input + ''' + audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction) + + ''' + Filter audio + ''' + # highpass audio - run 3 times to get -18dB per octave - unstable filters produced when using a 6th order + audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs) + audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs) + audio_samples = timbral_util.filter_audio_highpass(audio_samples, crossover=low_frequency_limit, fs=fs) + + # running 3 times to get -18dB per octave rolloff, greater than second order filters are unstable in python + lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(audio_samples,crossover=centroid_crossover_frequency,fs=fs) + lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(lowpass_centroid_audio_samples,crossover=centroid_crossover_frequency,fs=fs) + lowpass_centroid_audio_samples = timbral_util.filter_audio_lowpass(lowpass_centroid_audio_samples,crossover=centroid_crossover_frequency,fs=fs) + + lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(audio_samples,crossover=ratio_crossover_frequency,fs=fs) + lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(lowpass_ratio_audio_samples,crossover=ratio_crossover_frequency,fs=fs) + lowpass_ratio_audio_samples = timbral_util.filter_audio_lowpass(lowpass_ratio_audio_samples,crossover=ratio_crossover_frequency,fs=fs) + + ''' + Get spectrograms and normalise + ''' + # normalise audio + lowpass_ratio_audio_samples *= (1.0 / max(abs(audio_samples))) + lowpass_centroid_audio_samples *= (1.0 / max(abs(audio_samples))) + audio_samples *= (1.0 / max(abs(audio_samples))) + + # set FFT parameters + nfft = 4096 + hop_size = int(3 * nfft / 4) + # get spectrogram + if len(audio_samples) > nfft: + freq, time, spec = spectrogram(audio_samples, fs, 'hamming', nfft, hop_size, + nfft, 'constant', True, 'spectrum') + lp_centroid_freq, lp_centroid_time, lp_centroid_spec = spectrogram(lowpass_centroid_audio_samples, fs, + 'hamming', nfft, hop_size, nfft, + 'constant', True, 'spectrum') + lp_ratio_freq, lp_ratio_time, lp_ratio_spec = spectrogram(lowpass_ratio_audio_samples, fs, 'hamming', nfft, + hop_size, nfft, 'constant', True, 'spectrum') + + else: + # file is shorter than 4096, just take the fft + freq, time, spec = spectrogram(audio_samples, fs, 'hamming', len(audio_samples), len(audio_samples)-1, + nfft, 'constant', True, 'spectrum') + lp_centroid_freq, lp_centroid_time, lp_centroid_spec = spectrogram(lowpass_centroid_audio_samples, fs, + 'hamming', + len(lowpass_centroid_audio_samples), + len(lowpass_centroid_audio_samples)-1, + nfft, 'constant', True, 'spectrum') + lp_ratio_freq, lp_ratio_time, lp_ratio_spec = spectrogram(lowpass_ratio_audio_samples, fs, 'hamming', + len(lowpass_ratio_audio_samples), + len(lowpass_ratio_audio_samples)-1, + nfft, 'constant', True, 'spectrum') + + + + threshold = timbral_util.db2mag(threshold_db) + + + ''' + METRIC 1 - limited weighted mean normalised lower centroid + ''' + # define arrays for storing metrics + all_normalised_lower_centroid = [] + all_normalised_centroid_tpower = [] + + # get metrics for each time segment of the spectrogram + for idx in range(len(time)): + # get overall spectrum of time frame + current_spectrum = spec[:, idx] + # calculate time window power + tpower = np.sum(current_spectrum) + all_normalised_centroid_tpower.append(tpower) + + # estimate if time segment contains audio energy or just noise + if tpower > threshold: + # get the spectrum + lower_spectrum = lp_centroid_spec[:, idx] + lower_power = np.sum(lower_spectrum) + + # get lower centroid + lower_centroid = np.sum(lower_spectrum * lp_centroid_freq) / float(lower_power) + + # append to list + all_normalised_lower_centroid.append(lower_centroid) + else: + all_normalised_lower_centroid.append(0) + + # calculate the weighted mean of lower centroids + weighted_mean_normalised_lower_centroid = np.average(all_normalised_lower_centroid, + weights=all_normalised_centroid_tpower) + # limit to the centroid crossover frequency + if weighted_mean_normalised_lower_centroid > centroid_crossover_frequency: + limited_weighted_mean_normalised_lower_centroid = np.float64(centroid_crossover_frequency) + else: + limited_weighted_mean_normalised_lower_centroid = weighted_mean_normalised_lower_centroid + + + + ''' + METRIC 2 - weighted mean normalised lower ratio + ''' + # define arrays for storing metrics + all_normalised_lower_ratio = [] + all_normalised_ratio_tpower = [] + + # get metrics for each time segment of the spectrogram + for idx in range(len(time)): + # get time frame of broadband spectrum + current_spectrum = spec[:, idx] + tpower = np.sum(current_spectrum) + all_normalised_ratio_tpower.append(tpower) + + # estimate if time segment contains audio energy or just noise + if tpower > threshold: + # get the lowpass spectrum + lower_spectrum = lp_ratio_spec[:, idx] + # get the power of this + lower_power = np.sum(lower_spectrum) + # get the ratio of LF to all energy + lower_ratio = lower_power / float(tpower) + # append to array + all_normalised_lower_ratio.append(lower_ratio) + else: + all_normalised_lower_ratio.append(0) + + # calculate + weighted_mean_normalised_lower_ratio = np.average(all_normalised_lower_ratio, weights=all_normalised_ratio_tpower) + + ''' + METRIC 3 - Approximate duration/decay-time of sample + ''' + all_my_duration = [] + + # get envelpe of signal + envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs) + # estimate onsets + onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs) + + # get RMS envelope - better follows decays than the sample-and-hold + rms_step_size = 256 + rms_envelope = timbral_util.calculate_rms_enveope(audio_samples, step_size=rms_step_size) + + # convert decay threshold to magnitude + decay_threshold = timbral_util.db2mag(db_decay_threshold) + # rescale onsets to rms stepsize - casting to int + time_convert = fs / float(rms_step_size) + onsets = (np.array(onsets) / float(rms_step_size)).astype('int') + + for idx, onset in enumerate(onsets): + if onset == onsets[-1]: + segment = rms_envelope[onset:] + else: + segment = rms_envelope[onset:onsets[idx + 1]] + + # get location of max RMS frame + max_idx = np.argmax(segment) + # get the segment from this max until the next onset + post_max_segment = segment[max_idx:] + + # estimate duration based on decay or until next onset + if min(post_max_segment) >= decay_threshold: + my_duration = len(post_max_segment) / time_convert + else: + my_duration = np.where(post_max_segment < decay_threshold)[0][0] / time_convert + + # append to array + all_my_duration.append(my_duration) + + # calculate the lof of mean duration + mean_my_duration = np.log10(np.mean(all_my_duration)) + + + ''' + METRIC 4 - f0 estimation with peak picking + ''' + # get the overall spectrum + all_spectrum = np.sum(spec, axis=1) + # normalise this + norm_spec = (all_spectrum - np.min(all_spectrum)) / (np.max(all_spectrum) - np.min(all_spectrum)) + # set limit for peak picking + cthr = 0.01 + # detect peaks + peak_idx, peak_value, peak_freq = timbral_util.detect_peaks(norm_spec, cthr=cthr, unprocessed_array=norm_spec, + freq=freq) + # estimate peak + pitch_estimate = np.log10(min(peak_freq)) if peak_freq[0] > 0 else 0 + + + # get outputs + if dev_output: + return limited_weighted_mean_normalised_lower_centroid, weighted_mean_normalised_lower_ratio, mean_my_duration, \ + pitch_estimate, weighted_mean_normalised_lower_ratio * mean_my_duration, \ + timbral_util.sigmoid(weighted_mean_normalised_lower_ratio) * mean_my_duration + else: + ''' + Perform linear regression to obtain depth + ''' + # coefficients from linear regression + coefficients = np.array([-0.0043703565847874465, 32.83743202462131, 4.750862716905235, -14.217438690256062, + 3.8782339862813924, -0.8544826091735516, 66.69534393444391]) + + # what are the best metrics + metric1 = limited_weighted_mean_normalised_lower_centroid + metric2 = weighted_mean_normalised_lower_ratio + metric3 = mean_my_duration + metric4 = pitch_estimate + metric5 = metric2 * metric3 + metric6 = timbral_util.sigmoid(metric2) * metric3 + + # pack metrics into a matrix + all_metrics = np.zeros(7) + + all_metrics[0] = metric1 + all_metrics[1] = metric2 + all_metrics[2] = metric3 + all_metrics[3] = metric4 + all_metrics[4] = metric5 + all_metrics[5] = metric6 + all_metrics[6] = 1.0 + + # perform linear regression + depth = np.sum(all_metrics * coefficients) + + if clip_output: + depth = timbral_util.output_clip(depth) + + return depth + diff --git a/build/lib/timbral_models/Timbral_Extractor.py b/build/lib/timbral_models/Timbral_Extractor.py new file mode 100644 index 0000000..18e3e7c --- /dev/null +++ b/build/lib/timbral_models/Timbral_Extractor.py @@ -0,0 +1,140 @@ +from __future__ import division +import soundfile as sf +import numpy as np +import six +from . import timbral_util, timbral_hardness, timbral_depth, timbral_brightness, timbral_roughness, timbral_warmth, \ + timbral_sharpness, timbral_booming, timbral_reverb + +def timbral_extractor(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False, output_type='dictionary', verbose=True): + """ + The Timbral Extractor will extract all timbral attribute sin one function call, returning the results as either + a list or dictionary, depending on input definitions. + + Version 0.4 + + Simply calls each function with + + Required parameter + :param fname: string or numpy array + string, audio filename to be analysed, including full file path and extension. + numpy array, array of audio samples, requires fs to be set to the sample rate. + + Optional parameters + :param fs: int/float, when fname is a numpy array, this is a required to be the sample rate. + Defaults to 0. + :param phase_correction: bool, perform phase checking before summing to mono. Defaults to False. + :param dev_output: bool, when False return the depth, when True return all extracted + features. Default to False. + :param clip_output: bool, force the output to be between 0 and 100. + :param output_type: string, defines the type the output should be formatted in. Accepts either + 'dictionary' or 'list' as parameters. Default to 'dictionary'. + + :return: timbre the results from all timbral attributes as either a dictionary or list, depending + on output_type. + + Copyright 2019 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + """ + ''' + Check output_type before calculating anything + ''' + if output_type != 'dictionary' and output_type != 'list': + raise ValueError('output_type must be \'dictionary\' or \'list\'.') + + ''' + Basic audio reading + ''' + if isinstance(fname, six.string_types): + # read audio file only once and pass arrays to algorithms + try: + audio_samples, fs = sf.read(fname) + # making an array again for copying purposes + multi_channel_audio = np.array(audio_samples) + except: + print('Soundfile failed to load: ' + str(fname)) + raise TypeError('Unable to read audio file.') + elif hasattr(fname, 'shape'): + if fs==0: + raise ValueError('If giving function an array, \'fs\' must be specified') + audio_samples = fname + multi_channel_audio = np.array(fname) + else: + raise ValueError('Input must be either a string or a numpy array.') + + # channel reduction + audio_samples = timbral_util.channel_reduction(audio_samples) + + # resample audio file if sample rate is less than 44100 + audio_samples, fs = timbral_util.check_upsampling(audio_samples, fs) + + # functions can be given audio samples as well + if verbose: + print('Calculating hardness...') + hardness = timbral_hardness(audio_samples, fs=fs, + dev_output=dev_output, + phase_correction=phase_correction, + clip_output=clip_output) + if verbose: + print('Calculating depth...') + depth = timbral_depth(audio_samples, fs=fs, + dev_output=dev_output, + phase_correction=phase_correction, + clip_output=clip_output) + if verbose: + print('Calculating brightness...') + brightness = timbral_brightness(audio_samples, fs=fs, + dev_output=dev_output, + phase_correction=phase_correction, + clip_output=clip_output) + if verbose: + print('Calculating roughness...') + roughness = timbral_roughness(audio_samples, fs=fs, + dev_output=dev_output, + phase_correction=phase_correction, + clip_output=clip_output) + if verbose: + print('Calculating warmth...') + warmth = timbral_warmth(audio_samples, fs=fs, + dev_output=dev_output, + phase_correction=phase_correction, + clip_output=clip_output) + if verbose: + print('Calculating sharpness...') + sharpness = timbral_sharpness(audio_samples, fs=fs, + dev_output=dev_output, + phase_correction=phase_correction, + clip_output=clip_output) + if verbose: + print('Calculating boominess...') + boominess = timbral_booming(audio_samples, fs=fs, + dev_output=dev_output, + phase_correction=phase_correction, + clip_output=clip_output) + if verbose: + print('Calculating reverb...') + # reverb calculated on all channels + reverb = timbral_reverb(multi_channel_audio, fs=fs) + + ''' + Format output + ''' + if output_type=='dictionary': + timbre = { + 'hardness': hardness, + 'depth': depth, + 'brightness': brightness, + 'roughness': roughness, + 'warmth': warmth, + 'sharpness': sharpness, + 'boominess': boominess, + 'reverb': reverb + } + elif output_type == 'list': + timbre = [hardness, depth, brightness, roughness, warmth, sharpness, boominess, reverb] + else: + raise ValueError('output_type must be \'dictionary\' or \'list\'.') + + + return timbre + + + diff --git a/build/lib/timbral_models/Timbral_Hardness.py b/build/lib/timbral_models/Timbral_Hardness.py new file mode 100644 index 0000000..7f782d0 --- /dev/null +++ b/build/lib/timbral_models/Timbral_Hardness.py @@ -0,0 +1,274 @@ +from __future__ import division +import numpy as np +import librosa +import soundfile as sf +import six +from scipy.signal import spectrogram +from . import timbral_util + +def timbral_hardness(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False, max_attack_time=0.1, + bandwidth_thresh_db=-75): + """ + This function calculates the apparent hardness of an audio file. + This version of timbral_hardness contains self loudness normalising methods and can accept arrays as an input + instead of a string filename. + + Version 0.4 + + Required parameter + :param fname: string or numpy array + string, audio filename to be analysed, including full file path and extension. + numpy array, array of audio samples, requires fs to be set to the sample rate. + + Optional parameters + :param fs: int/float, when fname is a numpy array, this is a required to be the sample rate. + Defaults to 0. + :param phase_correction: bool, perform phase checking before summing to mono. Defaults to False. + :param dev_output: bool, when False return the depth, when True return all extracted + features. Default to False. + :param clip_output: bool, force the output to be between 0 and 100. + :param max_attack_time: float, set the maximum attack time, in seconds. Defaults to 0.1. + :param bandwidth_thresh_db: float, set the threshold for calculating the bandwidth, Defaults to -75dB. + + + :return: float, Apparent hardness of audio file, float (dev_output = False/default). + With dev_output set to True returns the weighted mean bandwidth, + mean attack time, harmonic-percussive ratio, and unitless attack centroid. + + Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + + ''' + Read input + ''' + audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction) + + ''' + Calculate the midband level + ''' + # get the level in the midband + midband_level, weighed_midband_level = timbral_util.weighted_bark_level(audio_samples, fs, low_bark_band=70, + upper_bark_band=140) + log_weighted_midband_level = np.log10(weighed_midband_level) + + ''' + Calculate the harmonic-percussive ratio pre zero-padding the signal + ''' + HP_ratio = timbral_util.get_percussive_audio(audio_samples, return_ratio=True) + log_HP_ratio = np.log10(HP_ratio) + + ''' + Zeropad the signal + ''' + # zero pad the signal + nperseg = 4096 # default value for spectrogram analysis + audio_samples = np.lib.pad(audio_samples, (nperseg+1, 0), 'constant', constant_values=(0.0, 0.0)) + + ''' + Calculate the envelope and onsets + ''' + # calculate the envelope of the signal + envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.1) + envelope_time = np.arange(len(envelope)) / fs + + # calculate the onsets + original_onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs, nperseg=nperseg) + onset_strength = librosa.onset.onset_strength(audio_samples, fs) + # If onsets don't exist, set it to time zero + if not original_onsets: + original_onsets = [0] + # set to start of file in the case where there is only one onset + if len(original_onsets) == 1: + original_onsets = [0] + + onsets = np.array(original_onsets) - nperseg + onsets[onsets < 0] = 0 + + ''' + Calculate the spectrogram so that the bandwidth can be created + ''' + bandwidth_step_size = 128 + mag = timbral_util.db2mag(bandwidth_thresh_db) # calculate threshold in linear from dB + bandwidth, t, f = timbral_util.get_bandwidth_array(audio_samples, fs, nperseg=nperseg, + overlap_step=bandwidth_step_size, rolloff_thresh=mag, + normalisation_method='none') + # bandwidth sample rate + bandwidth_fs = fs / float(bandwidth_step_size) # fs due to spectrogram step size + + ''' + Set all parameters for holding data per onset + ''' + all_bandwidth_max = [] + all_attack_time = [] + all_max_strength = [] + all_max_strength_bandwidth = [] + all_attack_centroid = [] + + ''' + Get bandwidth onset times and max bandwidth + ''' + bandwidth_onset = np.array(onsets / float(bandwidth_step_size)).astype('int') # overlap_step=128 + + ''' + Iterate through onsets and calculate metrics for each + ''' + for onset_count in range(len(bandwidth_onset)): + ''' + Calculate the bandwidth max for the attack portion of the onset + ''' + # get the section of the bandwidth array between onsets + onset = bandwidth_onset[onset_count] + if onset == bandwidth_onset[-1]: + bandwidth_seg = np.array(bandwidth[onset:]) + else: + next_onset = bandwidth_onset[onset_count + 1] + bandwidth_seg = np.array(bandwidth[onset:next_onset]) + + if max(bandwidth_seg) > 0: + # making a copy of the bandqwidth segment to avoid array changes + hold_bandwidth_seg = list(bandwidth_seg) + + # calculate onset of the attack in the bandwidth array + if max(bandwidth_seg) > 0: + bandwidth_attack = timbral_util.calculate_attack_time(bandwidth_seg, bandwidth_fs, + calculation_type='fixed_threshold', + max_attack_time=max_attack_time) + else: + bandwidth_attack = [] + + # calculate the badiwdth max for the attack portion + if bandwidth_attack: + start_idx = bandwidth_attack[2] + if max_attack_time > 0: + max_attack_time_samples = int(max_attack_time * bandwidth_fs) + if len(hold_bandwidth_seg[start_idx:]) > start_idx+max_attack_time_samples: + all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:start_idx+max_attack_time_samples])) + else: + all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:])) + else: + all_bandwidth_max.append(max(hold_bandwidth_seg[start_idx:])) + else: + # set as blank so bandwith + bandwidth_attack = [] + + ''' + Calculate the attack time + ''' + onset = original_onsets[onset_count] + if onset == original_onsets[-1]: + attack_seg = np.array(envelope[onset:]) + strength_seg = np.array(onset_strength[int(onset/512):]) # 512 is librosa default window size + audio_seg = np.array(audio_samples[onset:]) + else: + attack_seg = np.array(envelope[onset:original_onsets[onset_count + 1]]) + strength_seg = np.array(onset_strength[int(onset/512):int(original_onsets[onset_count+1]/512)]) + audio_seg = np.array(audio_samples[onset:original_onsets[onset_count + 1]]) + + attack_time = timbral_util.calculate_attack_time(attack_seg, fs, max_attack_time=max_attack_time) + all_attack_time.append(attack_time[0]) + + ''' + Get the attack strength for weighting the bandwidth max + ''' + all_max_strength.append(max(strength_seg)) + if bandwidth_attack: + all_max_strength_bandwidth.append(max(strength_seg)) + + ''' + Get the spectral centroid of the attack (125ms after attack start) + ''' + # identify the start of the attack + th_start_idx = attack_time[2] + # define how long the attack time can be + centroid_int_samples = int(0.125 * fs) # number of samples for attack time integration + + # start of attack section from attack time calculation + if th_start_idx + centroid_int_samples >= len(audio_seg): + audio_seg = audio_seg[th_start_idx:] + else: + audio_seg = audio_seg[th_start_idx:th_start_idx + centroid_int_samples] + + # check that there's a suitable legnth of samples to get attack centroid + # minimum length arbitrarily set to 512 samples + if len(audio_seg) > 512: + # get all spectral features for this attack section + spectral_features_hold = timbral_util.get_spectral_features(audio_seg, fs) + + # store unitless attack centroid if exists + if spectral_features_hold: + all_attack_centroid.append(spectral_features_hold[0]) + + ''' + Calculate mean and weighted average values for features + ''' + # attack time + mean_attack_time = np.mean(all_attack_time) + + # get the weighted mean of bandwidth max and limit lower value + if len(all_bandwidth_max): + mean_weighted_bandwidth_max = np.average(all_bandwidth_max, weights=all_max_strength_bandwidth) + # check for zero values so the log bandwidth max can be taken + if mean_weighted_bandwidth_max <= 512.0: + mean_weighted_bandwidth_max = fs / 512.0 # minimum value + else: + mean_weighted_bandwidth_max = fs / 512.0 # minimum value + + # take the logarithm + log_weighted_bandwidth_max = np.log10(mean_weighted_bandwidth_max) + + # get the mean of the onset strenths + mean_max_strength = np.mean(all_max_strength) + log_mean_max_strength = np.log10(mean_max_strength) + + if all_attack_centroid: + mean_attack_centroid = np.mean(all_attack_centroid) + else: + mean_attack_centroid = 200.0 + + # limit the lower limit of the attack centroid to allow for log to be taken + if mean_attack_centroid <= 200: + mean_attack_centroid = 200.0 + log_attack_centroid = np.log10(mean_attack_centroid) + + ''' + Either return the raw features, or calculaste the linear regression. + ''' + if dev_output: + return log_weighted_bandwidth_max, log_attack_centroid, log_weighted_midband_level, log_HP_ratio, log_mean_max_strength, mean_attack_time + else: + ''' + Apply regression model + ''' + all_metrics = np.ones(7) + all_metrics[0] = log_weighted_bandwidth_max + all_metrics[1] = log_attack_centroid + all_metrics[2] = log_weighted_midband_level + all_metrics[3] = log_HP_ratio + all_metrics[4] = log_mean_max_strength + all_metrics[5] = mean_attack_time + + # coefficients = np.array([13.5330599736, 18.1519030059, 13.1679266873, 5.03134507433, 5.22582123237, -3.71046018962, -89.8935449357]) + + # recalculated values when using loudnorm + coefficients = np.array([12.079781720638145, 18.52100377170042, 14.139883645260355, 5.567690321917516, + 3.9346817690405635, -4.326890461087848, -85.60352209068202]) + + hardness = np.sum(all_metrics * coefficients) + + # clip output between 0 and 100 + if clip_output: + hardness = timbral_util.output_clip(hardness) + + return hardness diff --git a/build/lib/timbral_models/Timbral_Reverb.py b/build/lib/timbral_models/Timbral_Reverb.py new file mode 100644 index 0000000..46ad373 --- /dev/null +++ b/build/lib/timbral_models/Timbral_Reverb.py @@ -0,0 +1,370 @@ +from __future__ import division +import numpy as np +import soundfile as sf +import six +from scipy.signal import spectrogram +from . import timbral_util + +def timbral_reverb(fname, fs=0, dev_output=False, phase_correction=False, clip_output=False): + """ + This function classifies the audio file as either not sounding reverberant. + + This is based on the RT60 estimation algirhtm documented in: + Jan, T., and Wang, W., 2012: "Blind reverberation time estimation based on Laplace distribution", + EUSIPCO. pp. 2050-2054, Bucharest, Romania. + + Version 0.4 + + Required parameter + :param fname: string or numpy array + string, audio filename to be analysed, including full file path and extension. + numpy array, array of audio samples, requires fs to be set to the sample rate. + + Optional parameters + :param fs: int/float, when fname is a numpy array, this is a required to be the sample rate. + Defaults to 0. + :param phase_correction: Has no effect on the code. Implemented for consistency with other timbral + functions. + :param dev_output: Has no effect on the code. Implemented for consistency with other timbral + functions. + :param clip_output: Has no effect on the code. Implemented for consistency with other timbral + functions. + + :return: predicted reverb of audio file. 1 represents the files osunds reverberant, 0 + represents the files does not sound reverberant. + + Copyright 2019 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + """ + # needs to accept the input as audio file + raw_audio_samples, fs = timbral_util.file_read(fname, fs=fs, phase_correction=False, mono_sum=False, loudnorm=False) + + # check for mono file + if len(raw_audio_samples.shape) < 2: + # it's a mono file + mean_RT60 = estimate_RT60(raw_audio_samples, fs) + else: + # the file has channels, estimate RT for the first two and take the mean + l_RT60 = estimate_RT60(raw_audio_samples[:, 0], fs) + r_RT60 = estimate_RT60(raw_audio_samples[:, 1], fs) + + mean_RT60 = np.mean([l_RT60, r_RT60]) + + ''' + need to develop a logistic regression model to test this. + ''' + probability = reverb_logistic_regression(mean_RT60) + + if dev_output: + return mean_RT60, probability + else: + if probability < 0.5: + return 0 + else: + return 1 + + +def estimate_RT60(audio_samples, fs): + + ''' No chanel rediuction, perform on each channel ''' + + # function[rt_est, par] = RT_estimation_my(y, fs) + + # performs blind RT estimation + # INPUT + # y: reverberant speech + # fs: sampling frequency + # + # OUTPUT + # rt_est: estimated RT + # par: struct with parameters used to execute the function + # rt_estimate_frame_my.m + # + # Codes were adapted from the original codes by Heinrich Loellmann, IND, RWTH Aachen + # + # Authors: Tariqullah Jan, moderated by Wenwu Wang, University of Surrey(2012) + + ''' + Initialisation + ''' + # --------------------------------------------- + + par = init_rt_estimate_e(fs) # struct with all parameters and buffers for frame-wise processing + BL = par['N'] * par['down'] # to simplify notation + + Laudio = len(audio_samples) + + # check audio file is long enough for analysis + if BL < Laudio: + rt_est = [] #np.zeros(int(round(Laudio / par['N_shift']))) + RT_final = [] + + ''' + frame-wise processing in the time-domain + ''' + # --------------------------------------------- + + k = 0 + n_array = np.arange(0, Laudio - BL + 1, par['N_shift']) + for n in n_array: + k += 1 # frame counter + ind = np.arange(n, n + BL) # indices of current frame + + # Actual RT estimation + RT, par, finalrt = rt_estimate_frame_my(audio_samples[ind[np.arange(0, len(ind), par['down'])]], par) + + rt_est.append(RT) # store estimated value + RT_final.append(finalrt) + else: + # audio too short for analysis, for returning smallest Rt value + return par['Tquant'][0] + + RT_final = np.clip(RT_final, 0, max(RT_final)) + aaa = RT_final[np.where(RT_final>0)] + + + RT_temp_new = [] + for i in range(1, len(aaa)): + RT_temp_new.append(0.49 * aaa[i - 1] + (1 - 0.49) * np.max(aaa)) + + + if aaa.size: + RTfinal_value = np.min(aaa) + + RT_temp_new = [] + for i in range(1, len(aaa)): + RT_temp_new.append(0.49 * aaa[i - 1] + (1 - 0.49) * np.max(aaa)) + + else: + RTfinal_value = par['Tquant'][0] + + rt_est = np.array(rt_est) + rt_est = rt_est[np.where(RT_final>0)] + if rt_est.size: + return np.mean(rt_est) + else: + return par['Tquant'][0] + + +def init_rt_estimate_e(fs=24000): + ''' + par = init_rt_estimate_e(fs) + executes initialization for the function + rt_estimate_frame.m to perform a blind estimation of the reverberation time + (RT) by frame-wise processing in the time-domain. + + INPUT + fs: sampling frequency(default=24 kHz) + + OUTPUT + par: struct containing all parameters and buffer for executing the + function rt_estimate_frame.m + + author: Heiner Loellmann, IND, RWTH Aachen University + + created: August 2011 + + general paraemters + ''' + par = {"fs":fs} + no = par['fs'] / 24000.0 # correction factor to account for different sampling frequency + + # pararmeters for pre - selection of suitable segments + if par['fs'] > 8e3: + par['down'] = 2 # rate for downsampling applied before RT estimation to reduce computational complexity + else: + par['down'] = 1 + + par['N_sub'] = int(round(no * 700 / par['down'])) # sub-frame length(after downsampling) + par['N_shift'] = int(round(no * 200 / par['down'])) # frame shift(before downsampling) + par['nos_min'] = 3 # minimal number of subframes to detect a sound decay + par['nos_max'] = 7 # maximal number of subframes to detect a sound decay + par['N'] = int(par['nos_max'] * par['N_sub']) # maximal frame length(after downsampling) + + # parameters for ML - estimation + Tmax = 1.1 # max RT being considered + Tmin = 0.2 #min RT being considered + par['bin'] = 0.1 # step-size for RT estimation + par['Tquant'] = np.arange(Tmin, Tmax+par['bin']/2, par['bin']) # set of qunatized RTs considered for maximum search + par['a'] = np.exp(-3.0 * np.log(10) / ( par['Tquant'] * (par['fs'] / par['down']))) # corresponding decay rate factors + par['La'] = len(par['a']) # num of considered decay rate factors( = no of.RTs) + + # paramters for histogram - based approach to reduce outliers (order statistics) + par['buffer_size'] = int(round(no * 800 / par['down'])) # buffer size + par['buffer'] = np.zeros(par['buffer_size']) # buffer with previous indices to update histogram + par['no_bins'] = int(par['La']) # no. of histogram bins + par['hist_limits'] = np.arange(Tmin - par['bin'] / 2.0, Tmax + par['bin'], par['bin']) # limits of histogram bins + par['hist_rt'] = np.zeros(par['no_bins']) # histogram with ML estimates + par['hist_counter'] = 0 # counter increased if histogram is updated + + # paramters for recursive smoothing of final RT estimate + par['alpha'] = 0.995 # smoothing factor + par['RT_initial'] = 0.3 # initial RT estimate + par['RT_last'] = par['RT_initial'] # last RT estimate + par['RT_raw'] = par['RT_initial'] # raw RT estimate obtained by histogram - approach + + return par + + +def rt_estimate_frame_my(frame, par): + ''' + performs an efficient blind estimation of the reverberation time(RT) for frame-wise + processing based on Laplacian distribution. + + INPUT + frame: (time-domain) segment with reverberant speech + par: struct with all parameters and buffers created by the function + init_binaural_speech_enhancement_e.m + + OUTPUT + RT: estimated RT + par: struct with updated buffers to enable a frame-wise processing + RT_pre: raw RT estimate(for debugging and analysis of the algorithm) + + Reference: LAllmann, H.W., Jeub, M., Yilmaz, E., and Vary, P.: + An Improved Algorithm for Blind Reverberation Time Estimation, a + International Workshop on Acoustic Echo and Noise Control(IWAENC), Tel Aviv, Israel, Aug. 2010. + + Tariqullah Jan and Wenwu Wang: + Blind reverberation time estimation based on Laplacian distribution + European Signal Processing Conference(EUSIPCO), 2012. + + The codes were adapted based on the original codes by Heinrich Loellmann, IND, RWTH Aachen + + Authors: Tariqullah Jan, moderated by Wenwu Wang, University of Surrey(2012) + ''' + if len(np.shape(np.squeeze(frame))) > 1: + raise ValueError('Something went wrong...') + + cnt = 0 # sub-frame counter for pre - selection of possible sound decay + RTml = -1 # default RT estimate (-1 indicates no new RT estimate) + + # calculate variance, minimum and maximum of first sub-frame + seg = frame[:par['N_sub']] + + var_pre = np.var(seg) + min_pre = np.min(seg) + max_pre = np.max(seg) + + for k in range(2, par['nos_max']): + # calculate variance, minimum and maximum of succeding sub-frame + seg = frame[(k-1) * par['N_sub'] : k * par['N_sub']+1] + var_cur = np.var(seg) + max_cur = max(seg) + min_cur = min(seg) + + #-- Pre-Selection of suitable speech decays -------------------- + if (var_pre > var_cur) and (max_pre > max_cur) and (min_pre < min_cur): + # if variance, maximum decraease, and minimum increase + # = > possible sound decay detected + + cnt += 1 + + # current values becomes previous values + var_pre = var_cur + max_pre = max_cur + min_pre = min_cur + + else: + if cnt >= par['nos_min']: + # minimum length for assumed sound decay achieved? + # -- Maximum Likelihood(ML) Estimation of the RT + RTml, _ = max_loglf(frame[:cnt*par['N_sub']], par['a'], par['Tquant']) + + break + + + if k == par['nos_max']: + # maximum frame length achieved? + RTml, _ = max_loglf(frame[0:cnt * par['N_sub']], par['a'], par['Tquant']) + + # end of sub-frame loop + + if RTml >= 0: # new ML estimate calculated + + # apply order statistics to reduce outliers + par['hist_counter'] += 1 + + for i in range(par['no_bins']): + + # find index corresponding to the ML estimate + # find index corresponding to the ML estimate + if (RTml >= par['hist_limits'][i]) and (RTml <= par['hist_limits'][i+1]): + + index = i + break + + # update histogram with ML estimates for the RT + par['hist_rt'][index] += 1 + + if par['hist_counter'] > par['buffer_size'] + 1: + # remove old values from histogram + par['hist_rt'][int(par['buffer'][0])] = par['hist_rt'][int(par['buffer'][0])] - 1 + + par['buffer'] = np.append(par['buffer'][1:], index) # % update buffer with indices + idx = np.argmax(par['hist_rt']) # find index for maximum of the histogram + + par['RT_raw'] = par['Tquant'][idx] # map index to RT value + + + # final RT estimate obtained by recursive smoothing + RT = par['alpha'] * par['RT_last'] + (1 - par['alpha']) * par['RT_raw'] + par['RT_last'] = RT + + RT_pre = RTml # intermediate ML estimate for later analysis + + return RT, par, RT_pre + + +def max_loglf(h, a, Tquant): + ''' + [ML, ll] = max_loglf(h, a, Tquant) + + returns the maximum of the log-likelihood(LL) function and the LL + function itself for a finite set of decay rates + + INPUT + h: input frame + a: finite set of values for which the max.should be found + T: corresponding RT values for vector a + + OUTPUT + ML: ML estimate for the RT + ll: underlying LL - function + ''' + + N = len(h) + n = np.arange(0, N) # indices for input vector + ll = np.zeros(len(a)) + + # transpose? + h_square = h.transpose() + + for i in range(len(a)): + sum1 = np.dot((a[i] ** (-1.0 * n)), np.abs(h_square)) + sum2 = np.sum(np.abs(h_square)) + sigma = (1 / N) * sum1 + ll[i] = -N * np.log(2) - N * np.log(sigma) - np.sum(np.log(a[i] ** n)) - (1 / sigma) * sum1 + + + idx = np.argmax(ll) # maximum of the log-likelihood function + ML = Tquant[idx] # corresponding ML estimate for the RT + + return ML, ll + + +def reverb_logistic_regression(mean_RT60): + """ + Logistic regression function to determine if the file sound reverberant or not. + :param mean_RT60: + :return: + """ + # apply linear coefficients + coefficients = [2.97126461] + intercept = -1.45082989 + attributes = [mean_RT60] + logit_model = np.sum(np.array(coefficients) * np.array(attributes)) + intercept + + # apply inverse of Logit function to obtain probability + probability = np.exp(logit_model) / (1.0 + np.exp(logit_model)) + + return probability \ No newline at end of file diff --git a/build/lib/timbral_models/Timbral_Roughness.py b/build/lib/timbral_models/Timbral_Roughness.py new file mode 100644 index 0000000..ba78994 --- /dev/null +++ b/build/lib/timbral_models/Timbral_Roughness.py @@ -0,0 +1,185 @@ +from __future__ import division +import numpy as np +import soundfile as sf +from . import timbral_util + + +def plomp(f1, f2): + """ + Plomp's algorithm for estimating roughness. + + :param f1: float, frequency of first frequency of the pair + :param f2: float, frequency of second frequency of the pair + :return: + """ + b1 = 3.51 + b2 = 5.75 + xstar = 0.24 + s1 = 0.0207 + s2 = 18.96 + s = np.tril(xstar / ((s1 * np.minimum(f1, f2)) + s2)) + pd = np.exp(-b1 * s * np.abs(f2 - f1)) - np.exp(-b2 * s * np.abs(f2 - f1)) + return pd + + +def timbral_roughness(fname, dev_output=False, phase_correction=False, clip_output=False, fs=0, peak_picking_threshold=0.01): + """ + This function is an implementation of the Vassilakis [2007] model of roughness. + The peak picking algorithm implemented is based on the MIR toolbox's implementation. + + This version of timbral_roughness contains self loudness normalising methods and can accept arrays as an input + instead of a string filename. + + Version 0.4 + + + Vassilakis, P. 'SRA: A Aeb-based researh tool for spectral and roughness analysis of sound signals', Proceedings + of the 4th Sound and Music Computing Conference, Lefkada, Greece, July, 2007. + + Required parameter + :param fname: string, Audio filename to be analysed, including full file path and extension. + + Optional parameters + :param dev_output: bool, when False return the roughness, when True return all extracted features + (current none). + :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum. + Defaults to False. + + :return: Roughness of the audio signal. + + Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + ''' + Read input + ''' + audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction) + + ''' + Pad audio + ''' + # pad audio + audio_samples = np.lib.pad(audio_samples, (512, 0), 'constant', constant_values=(0.0, 0.0)) + + ''' + Reshape audio into time windows of 50ms. + ''' + # reshape audio + audio_len = len(audio_samples) + time_step = 0.05 + step_samples = int(fs * time_step) + nfft = step_samples + window = np.hamming(nfft + 2) + window = window[1:-1] + olap = nfft / 2 + num_frames = int((audio_len)/(step_samples-olap)) + next_pow_2 = np.log(step_samples) / np.log(2) + next_pow_2 = 2 ** int(next_pow_2 + 1) + + reshaped_audio = np.zeros([next_pow_2, num_frames]) + + i = 0 + start_idx = int((i * (nfft / 2.0))) + + # check if audio is too short to be reshaped + if audio_len > step_samples: + # get all the audio + while start_idx+step_samples <= audio_len: + audio_frame = audio_samples[start_idx:start_idx+step_samples] + + # apply window + audio_frame = audio_frame * window + + # append zeros + reshaped_audio[:step_samples, i] = audio_frame + + # increase the step + i += 1 + start_idx = int((i * (nfft / 2.0))) + else: + # reshaped audio is just padded audio samples + reshaped_audio[:audio_len, i] = audio_samples + + spec = np.fft.fft(reshaped_audio, axis=0) + spec_len = int(next_pow_2/2) + 1 + spec = spec[:spec_len, :] + spec = np.absolute(spec) + + freq = fs/2 * np.linspace(0, 1, spec_len) + + # normalise spectrogram based from peak TF bin + norm_spec = (spec - np.min(spec)) / (np.max(spec) - np.min(spec)) + + ''' Peak picking algorithm ''' + cthr = peak_picking_threshold # threshold for peak picking + + _, no_segments = np.shape(spec) + + allpeakpos = [] + allpeaklevel = [] + allpeaktime = [] + + for i in range(0, no_segments): + d = norm_spec[:, i] + d_un = spec[:, i] + + # find peak candidates + peak_pos, peak_level, peak_x = timbral_util.detect_peaks(d, cthr=cthr, unprocessed_array=d_un, freq=freq) + + allpeakpos.append(peak_pos) + allpeaklevel.append(peak_level) + allpeaktime.append(peak_x) + + ''' Calculate the Vasillakis Roughness ''' + allroughness = [] + # for each frame + for frame in range(len(allpeaklevel)): + frame_freq = allpeaktime[frame] + frame_level = allpeaklevel[frame] + + if len(frame_freq) > 1: + f2 = np.kron(np.ones([len(frame_freq), 1]), frame_freq) + f1 = f2.T + v2 = np.kron(np.ones([len(frame_level), 1]), frame_level) + v1 = v2.T + + X = v1 * v2 + Y = (2 * v2) / (v1 + v2) + Z = plomp(f1, f2) + rough = (X ** 0.1) * (0.5 * (Y ** 3.11)) * Z + + allroughness.append(np.sum(rough)) + else: + allroughness.append(0) + + mean_roughness = np.mean(allroughness) + + if dev_output: + return [mean_roughness] + else: + ''' + Perform linear regression + ''' + # cap roughness for low end + if mean_roughness < 0.01: + return 0 + else: + roughness = np.log10(mean_roughness) * 13.98779569 + 48.97606571545886 + if clip_output: + roughness = timbral_util.output_clip(roughness) + + return roughness + + + diff --git a/build/lib/timbral_models/Timbral_Sharpness.py b/build/lib/timbral_models/Timbral_Sharpness.py new file mode 100644 index 0000000..fce2c93 --- /dev/null +++ b/build/lib/timbral_models/Timbral_Sharpness.py @@ -0,0 +1,120 @@ +from __future__ import division +import numpy as np +import soundfile as sf +from . import timbral_util + + +def sharpness_Fastl(loudspec): + """ + Calculates the sharpness based on FASTL (1991) + Expression for weighting function obtained by fitting an + equation to data given in 'Psychoacoustics: Facts and Models' + using MATLAB basic fitting function + Original Matlab code by Claire Churchill Sep 2004 + Transcoded by Andy Pearce 2018 + """ + n = len(loudspec) + gz = np.ones(140) + z = np.arange(141,n+1) + gzz = 0.00012 * (z/10.0) ** 4 - 0.0056 * (z/10.0) ** 3 + 0.1 * (z/10.0) ** 2 -0.81 * (z/10.0) + 3.5 + gz = np.concatenate((gz, gzz)) + z = np.arange(0.1, n/10.0+0.1, 0.1) + + sharp = 0.11 * np.sum(loudspec * gz * z * 0.1) / np.sum(loudspec * 0.1) + return sharp + + +def timbral_sharpness(fname, dev_output=False, phase_correction=False, clip_output=False, fs=0): + """ + This is an implementation of the matlab sharpness function found at: + https://www.salford.ac.uk/research/sirc/research-groups/acoustics/psychoacoustics/sound-quality-making-products-sound-better/accordion/sound-quality-testing/matlab-codes + + This function calculates the apparent Sharpness of an audio file. + This version of timbral_sharpness contains self loudness normalising methods and can accept arrays as an input + instead of a string filename. + + Version 0.4 + + Originally coded by Claire Churchill Sep 2004 + Transcoded by Andy Pearce 2018 + + Required parameter + :param fname: string, audio filename to be analysed, including full file path and extension. + + Optional parameters + :param dev_output: bool, when False return the warmth, when True return all extracted features + :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum. + Defaults to False. + :param clip_output: bool, bool, force the output to be between 0 and 100. Defaults to False. + + :return Apparent sharpness of the audio file. + + + Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + """ + ''' + Read input + ''' + audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction) + + # window the audio file into 4096 sample sections + windowed_audio = timbral_util.window_audio(audio_samples, window_length=4096) + + windowed_sharpness = [] + windowed_rms = [] + for i in range(windowed_audio.shape[0]): + samples = windowed_audio[i, :] + + # calculate the rms and append to list + windowed_rms.append(np.sqrt(np.mean(samples * samples))) + + # calculate the specific loudness + N_entire, N_single = timbral_util.specific_loudness(samples, Pref=100.0, fs=fs, Mod=0) + + # calculate the sharpness if section contains audio + if N_entire > 0: + sharpness = sharpness_Fastl(N_single) + else: + sharpness = 0 + + windowed_sharpness.append(sharpness) + + # convert lists to numpy arrays for fancy indexing + windowed_rms = np.array(windowed_rms) + windowed_sharpness = np.array(windowed_sharpness) + # calculate the sharpness as the rms-weighted average of sharpness + rms_sharpness = np.average(windowed_sharpness, weights=(windowed_rms * windowed_rms)) + + # take the logarithm to better much subjective ratings + rms_sharpness = np.log10(rms_sharpness) + + if dev_output: + return [rms_sharpness] + else: + + all_metrics = np.ones(2) + all_metrics[0] = rms_sharpness + + # coefficients from linear regression + coefficients = [102.50508921364404, 34.432655185001735] + + # apply regression + sharpness = np.sum(all_metrics * coefficients) + + if clip_output: + sharpness = timbral_util.output_clip(sharpness) + + return sharpness diff --git a/build/lib/timbral_models/Timbral_Warmth.py b/build/lib/timbral_models/Timbral_Warmth.py new file mode 100644 index 0000000..b0f8a4a --- /dev/null +++ b/build/lib/timbral_models/Timbral_Warmth.py @@ -0,0 +1,263 @@ +from __future__ import division +import numpy as np +import soundfile as sf +from scipy.signal import spectrogram +import scipy.stats +from sklearn import linear_model +from . import timbral_util + + +def warm_region_cal(audio_samples, fs): + """ + Function for calculating various warmth parameters. + + :param audio_samples: numpy.array, an array of the audio samples, reques only one dimension. + :param fs: int, the sample ratr of the audio file. + + :return: four outputs: mean warmth region, weighted-average warmth region, mean high frequency level, + weighted-average high frequency level. + """ + #window the audio + windowed_samples = timbral_util.window_audio(audio_samples) + + # need to define a function for the roughness stimuli, emphasising the 20 - 40 region (of the bark scale) + min_bark_band = 10 + max_bark_band = 40 + mean_bark_band = (min_bark_band + max_bark_band) / 2.0 + array = np.arange(min_bark_band, max_bark_band) + x = timbral_util.normal_dist(array, theta=0.01, mean=mean_bark_band) + x -= np.min(x) + x /= np.max(x) + + wr_array = np.zeros(240) + wr_array[min_bark_band:max_bark_band] = x + + # need to define a second array emphasising the 20 - 40 region (of the bark scale) + min_bark_band = 80 + max_bark_band = 240 + mean_bark_band = (min_bark_band + max_bark_band) / 2.0 + array = np.arange(min_bark_band, max_bark_band) + x = timbral_util.normal_dist(array, theta=0.01, mean=mean_bark_band) + x -= np.min(x) + x /= np.max(x) + + hf_array = np.zeros(240) + hf_array[min_bark_band:max_bark_band] = x + + windowed_loud_spec = [] + windowed_rms = [] + + wr_vals = [] + hf_vals = [] + + for i in range(windowed_samples.shape[0]): + samples = windowed_samples[i, :] + N_entire, N_single = timbral_util.specific_loudness(samples, Pref=100.0, fs=fs, Mod=0) + + # append the loudness spec + windowed_loud_spec.append(N_single) + windowed_rms.append(np.sqrt(np.mean(samples * samples))) + + wr_vals.append(np.sum(wr_array * N_single)) + hf_vals.append(np.sum(hf_array * N_single)) + + mean_wr = np.mean(wr_vals) + mean_hf = np.mean(hf_vals) + weighted_wr = np.average(wr_vals, weights=windowed_rms) + weighted_hf = np.average(hf_vals, weights=windowed_rms) + + return mean_wr, weighted_wr, mean_hf, weighted_hf + + +def timbral_warmth(fname, dev_output=False, phase_correction=False, clip_output=False, max_FFT_frame_size=8192, + max_WR = 12000, fs=0): + """ + This function estimates the perceptual Warmth of an audio file. + + This model of timbral_warmth contains self loudness normalising methods and can accept arrays as an input + instead of a string filename. + + Version 0.4 + + Required parameter + :param fname: string, Audio filename to be analysed, including full file path and extension. + + Optional parameters + :param dev_output: bool, when False return the warmth, when True return all extracted features in a + list. + :param phase_correction: bool, if the inter-channel phase should be estimated when performing a mono sum. + Defaults to False. + :param max_FFT_frame_size: int, Frame size for calculating spectrogram, default to 8192. + :param max_WR: float, maximun allowable warmth region frequency, defaults to 12000. + + :return: Estimated warmth of audio file. + + Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + """ + ''' + Read input + ''' + audio_samples, fs = timbral_util.file_read(fname, fs, phase_correction=phase_correction) + + # get the weighted high frequency content + mean_wr, _, _, weighted_hf = warm_region_cal(audio_samples, fs) + + # calculate the onsets + envelope = timbral_util.sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.1) + envelope_time = np.arange(len(envelope)) / float(fs) + + # calculate the onsets + nperseg = 4096 + original_onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs, nperseg=nperseg) + # If onsets don't exist, set it to time zero + if not original_onsets: + original_onsets = [0] + # set to start of file in the case where there is only one onset + if len(original_onsets) == 1: + original_onsets = [0] + ''' + Initialise lists for storing features + ''' + # set defaults for holding + all_rms = [] + all_ratio = [] + all_SC = [] + all_WR_Ratio = [] + all_decay_score = [] + + + # calculate metrics for each onset + for idx, onset in enumerate(original_onsets): + if onset == original_onsets[-1]: + # this is the last onset + segment = audio_samples[onset:] + else: + segment = audio_samples[onset:original_onsets[idx+1]] + + segment_rms = np.sqrt(np.mean(segment * segment)) + all_rms.append(segment_rms) + + # get FFT of signal + segment_length = len(segment) + if segment_length < max_FFT_frame_size: + freq, time, spec = spectrogram(segment, fs, nperseg=segment_length, nfft=max_FFT_frame_size) + else: + freq, time, spec = spectrogram(segment, fs, nperseg=max_FFT_frame_size, nfft=max_FFT_frame_size) + + # flatten the audio to 1 dimension. Catches some strange errors that cause crashes + if spec.shape[1] > 1: + spec = np.sum(spec, axis=1) + spec = spec.flatten() + + # normalise for this onset + spec = np.array(list(spec)).flatten() + this_shape = spec.shape + spec /= max(abs(spec)) + + ''' + Estimate of fundamental frequency + ''' + # peak picking algorithm + peak_idx, peak_value, peak_x = timbral_util.detect_peaks(spec, freq=freq, fs=fs) + # find lowest peak + fundamental = np.min(peak_x) + fundamental_idx = np.min(peak_idx) + + ''' + Warmth region calculation + ''' + # estimate the Warmth region + WR_upper_f_limit = fundamental * 3.5 + if WR_upper_f_limit > max_WR: + WR_upper_f_limit = 12000 + tpower = np.sum(spec) + WR_upper_f_limit_idx = int(np.where(freq > WR_upper_f_limit)[0][0]) + + if fundamental < 260: + # find frequency bin closest to 260Hz + top_level_idx = int(np.where(freq > 260)[0][0]) + # sum energy up to this bin + low_energy = np.sum(spec[fundamental_idx:top_level_idx]) + # sum all energy + tpower = np.sum(spec) + # take ratio + ratio = low_energy / float(tpower) + else: + # make exception where fundamental is greater than + ratio = 0 + + all_ratio.append(ratio) + + ''' + Spectral centroid of the segment + ''' + # spectral centroid + top = np.sum(freq * spec) + bottom = float(np.sum(spec)) + SC = np.sum(freq * spec) / float(np.sum(spec)) + all_SC.append(SC) + + ''' + HF decay + - linear regression of the values above the warmth region + ''' + above_WR_spec = np.log10(spec[WR_upper_f_limit_idx:]) + above_WR_freq = np.log10(freq[WR_upper_f_limit_idx:]) + np.ones_like(above_WR_freq) + metrics = np.array([above_WR_freq, np.ones_like(above_WR_freq)]) + + # create a linear regression model + model = linear_model.LinearRegression(fit_intercept=False) + model.fit(metrics.transpose(), above_WR_spec) + decay_score = model.score(metrics.transpose(), above_WR_spec) + all_decay_score.append(decay_score) + + + ''' + get mean values + ''' + mean_SC = np.log10(np.mean(all_SC)) + mean_decay_score = np.mean(all_decay_score) + weighted_mean_ratio = np.average(all_ratio, weights=all_rms) + + if dev_output: + return mean_SC, weighted_hf, mean_wr, mean_decay_score, weighted_mean_ratio + else: + + ''' + Apply regression model + ''' + all_metrics = np.ones(6) + all_metrics[0] = mean_SC + all_metrics[1] = weighted_hf + all_metrics[2] = mean_wr + all_metrics[3] = mean_decay_score + all_metrics[4] = weighted_mean_ratio + + coefficients = np.array([-4.464258317026696, + -0.08819320850778556, + 0.29156539973575546, + 17.274733561081554, + 8.403340066029507, + 45.21212125085579]) + + warmth = np.sum(all_metrics * coefficients) + + # clip output between 0 and 100 + if clip_output: + warmth = timbral_util.output_clip(warmth) + + return warmth diff --git a/build/lib/timbral_models/__init__.py b/build/lib/timbral_models/__init__.py new file mode 100644 index 0000000..0633d7f --- /dev/null +++ b/build/lib/timbral_models/__init__.py @@ -0,0 +1,10 @@ +from .Timbral_Brightness import timbral_brightness +from .Timbral_Depth import timbral_depth +from .Timbral_Hardness import timbral_hardness +from .Timbral_Roughness import timbral_roughness +from .Timbral_Warmth import timbral_warmth +from .Timbral_Sharpness import timbral_sharpness +from .Timbral_Booming import timbral_booming +from .Timbral_Reverb import timbral_reverb +from .Timbral_Extractor import timbral_extractor +from .timbral_util import * diff --git a/build/lib/timbral_models/timbral_util.py b/build/lib/timbral_models/timbral_util.py new file mode 100644 index 0000000..30a209e --- /dev/null +++ b/build/lib/timbral_models/timbral_util.py @@ -0,0 +1,1816 @@ +from __future__ import division, print_function +import numpy as np +import librosa +import soundfile as sf +from scipy.signal import butter, lfilter, spectrogram +import scipy.stats +import pyloudnorm as pyln +import six + +""" + The timbral util is a collection of functions that can be accessed by the individual timbral models. These can be + used for extracting features or manipulating the audio that are useful to multiple attributes. + + Version 0.4 + + Copyright 2018 Andy Pearce, Institute of Sound Recording, University of Surrey, UK. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + + +def db2mag(dB): + """ + Converts from dB to linear magnitude. + + :param dB: dB level to be converted. + :return: linear magnitude of the dB input. + """ + mag = 10 ** (dB / 20.0) + return mag + + +def get_percussive_audio(audio_samples, return_ratio=True): + """ + Gets the percussive comonent of the audio file. + Currently, the default values for harmonic/percussive decomposition have been used. + Future updates may change the defaults for better separation or to improve the correlation to subjective data. + + :param audio_samples: The audio samples to be harmonicall/percussively separated + :param return_ratio: Determins the value returned by the function. + + :return: If return_ratio is True (default), the ratio of percussive energy is returned. + If False, the function returns the percussive audio as a time domain array. + """ + # use librosa decomposition + D = librosa.core.stft(audio_samples) + H, P = librosa.decompose.hpss(D) + + # inverse transform to get time domain arrays + percussive_audio = librosa.core.istft(P) + harmonic_audio = librosa.core.istft(H) + + if return_ratio: + # frame by frame RMS energy + percussive_energy = calculate_rms_enveope(percussive_audio, step_size=1024, overlap_step=512, normalise=False) + harmonic_energy = calculate_rms_enveope(harmonic_audio, step_size=1024, overlap_step=512, normalise=False) + + # set defaults for storing the data + ratio = [] + t_power = [] + + #get the ratio for each RMS time frame + for i in range(len(percussive_energy)): + if percussive_energy[i] != 0 or harmonic_energy[i] != 0: + # if percussive_energy[i] != 0 and harmonic_energy[i] != 0: + ratio.append(percussive_energy[i] / (percussive_energy[i] + harmonic_energy[i])) + t_power.append((percussive_energy[i] + harmonic_energy[i])) + + if t_power: + # take a weighted average of the ratio + ratio = np.average(ratio, weights=t_power) + return ratio + else: + # return the percussive audio when return_ratio is False + return percussive_audio + + +def filter_audio_highpass(audio_samples, crossover, fs, order=2): + """ Calculate and apply a high-pass filter, with a -3dB point of crossover. + + :param audio_samples: data to be filtered as an array. + :param crossover: the crossover frequency of the filter. + :param fs: the sampling frequency of the audio file. + :param order: order of the filter, defaults to 2. + + :return: filtered array. + """ + nyq = 0.5 * fs + xfreq = crossover / nyq + b, a = butter(order, xfreq, 'high') + y = lfilter(b, a, audio_samples) + return y + + +def filter_audio_lowpass(audio_samples, crossover, fs, order=2): + """ Calculate and apply a low-pass filter, with a -3dB point of crossover. + + :param audio_samples: data to be filtered as an array. + :param crossover: the crossover frequency of the filter. + :param fs: the sampling frequency of the audio file. + :param order: order of the filter, defaults to 2. + + :return: filtered array. + """ + nyq = 0.5 * fs + xfreq = crossover / nyq + b, a = butter(order, xfreq, 'low') + y = lfilter(b, a, audio_samples) + return y + + +def butter_bandpass(lowcut, highcut, fs, order=2): + """ Design a butterworth bandpass filter """ + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + b, a = butter(order, [low, high], btype='band') + return b, a + + +def filter_audio_bandpass(audio_samples, f0, noct, fs, order=2): + """ Calculate and apply an n/octave butterworth bandpass filter, centred at f0 Hz. + + :param audio_samples: the audio file as an array + :param fs: the sampling frequency of the audio file + :param f0: the centre frequency of the bandpass filter + :param bandwidth: the bandwidth of the filter + :param order: order of the filter, defaults to 2 + + :return: audio file filtered + """ + fd = 2 ** (1.0 / (noct * 2)) + lowcut = f0 / fd + highcut = f0 * fd + + b, a = butter_bandpass(lowcut, highcut, fs, order=order) + y = lfilter(b, a, audio_samples) + return y + + +def return_loop(onset_loc, envelope, function_time_thresh, hist_threshold, hist_time_samples, nperseg=512): + """ This function is used by the calculate_onsets method. + This looks backwards in time from the attack time and attempts to find the exact onset point by + identifying the point backwards in time where the envelope no longer falls. + This function includes a hyteresis to account for small deviations in the attack due to the + envelope calculation. + + Function looks 10ms (function_time_thresh) backwards from the onset time (onset_loc), looking for any sample + lower than the current sample. This repeats, starting at the minimum value until no smaller value is found. + Then the function looks backwards over 200ms, checking if the increase is greater than 10% of the full envelope's + dynamic range. + + onset_loc: The onset location estimated by librosa (converted to time domain index) + envelope: Envelope of the audio file + function_time_thresh: Time threshold for looking backwards in time. Set in the timbral_hardness code + to be the number of samples that equates to 10ms + hist_threshold: Level threshold to check over 200ms if the peak is small enough to continue looking + backwards in time. + hist_time_samples: Number of samples to look back after finding the minimum value over 10ms, set to 200ms. + """ + + # define flag for exiting while loop + found_start = False + + while not found_start: + # get the current sample value + current_sample = envelope[int(onset_loc)] + # get the previous 10ms worth of samples + if onset_loc - function_time_thresh > 0: + evaluation_array = envelope[onset_loc - function_time_thresh - 1:onset_loc] + else: + evaluation_array = envelope[:onset_loc - 1] + + if min(evaluation_array) - current_sample <= 0: + ''' + If the minimum value within previous 10ms is less than current sample, + move to the start position to the minimum value and look again. + ''' + min_idx = np.argmin(evaluation_array) + new_onset_loc = min_idx + onset_loc - function_time_thresh - 1 + + if new_onset_loc > nperseg: + onset_loc = new_onset_loc + else: + ''' Current index is close to start of the envelope, so exit with the idx as 512 ''' + return 0 + + else: + ''' + If the minimum value within previous 10ms is greater than current sample, + introduce the time and level hysteresis to check again. + ''' + # get the array of 200ms previous to the current onset idx + if (onset_loc - hist_time_samples - 1) > 0: + hyst_evaluation_array = envelope[onset_loc - hist_time_samples - 1:onset_loc] + else: + hyst_evaluation_array = envelope[:onset_loc] + + # values less than current sample + all_match = np.where(hyst_evaluation_array < envelope[onset_loc]) + + # if no minimum was found within the extended time, exit with current onset idx + if len(all_match[0]) == 0: + return onset_loc + + # get the idx of the closest value which is lower than the current onset idx + last_min = all_match[0][-1] + last_idx = int(onset_loc - len(hyst_evaluation_array) + last_min) + + # get the dynamic range of this segment + segment_dynamic_range = max(hyst_evaluation_array[last_min:]) - min(hyst_evaluation_array[last_min:]) + + # compare this dynamic range against the hyteresis threshold + if segment_dynamic_range >= hist_threshold: + ''' + The dynamic range is greater than the threshold, therefore this is a separate audio event. + Return the current onset idx. + ''' + return onset_loc + else: + ''' + The dynamic range is less than the threshold, therefore this is not a separate audio event. + Set current onset idx to minimum value and repeat. + ''' + if last_idx >= nperseg: + onset_loc = last_idx + else: + ''' + The hysteresis check puts the new threshold too close to the start + ''' + return 0 + + +def sample_and_hold_envelope_calculation(audio_samples, fs, decay_time=0.2, hold_time=0.01): + """ + Calculates the envelope of audio_samples with a 'sample and hold' style function. + This ensures that the minimum attack time is not limited by low-pass filtering, + a common method of obtaining the envelope. + + :param audio_samples: audio array + :param fs: sampling frequency + :param decay_time: decay time after peak hold + :param hold_time: hold time when identifying a decay + + :return: envelope of audio_samples + """ + # rectify the audio signal + abs_samples = abs(audio_samples) + envelope = [] + + # set parameters for envelope function + decay = max(abs_samples) / (decay_time * fs) # decay rate relative to peak level of audio signal + hold_samples = hold_time * fs # number of samples to hold before decay + hold_counter = 0 + previous_sample = 0.0 + + # perform the sample, hold, and decay function to obtain envelope + for sample in abs_samples: + if sample >= previous_sample: + envelope.append(sample) + previous_sample = sample + hold_counter = 0 + else: + # check hold length + if hold_counter < hold_samples: + hold_counter += 1 + envelope.append(previous_sample) + else: + out = previous_sample - decay + if out > sample: + envelope.append(out) + previous_sample = out + else: + envelope.append(sample) + previous_sample = sample + + # convert to numpy array + return np.array(envelope) + + +def get_spectral_features(audio, fs, lf_limit=20, scale='hz', cref=27.5, power=2, window_type='none', + rollon_thresh=0.05): + """ + This function calculates the spectral centroid and spectral spread of an audio array. + + :param audio: Audio array + :param fs: Sample rate of audio file + :param lf_limit: Low frequency limit, in Hz, to be analysed. Defaults to 20Hz. + :param scale: The frequency scale that calculations should be made over. if no argument is given, this + defaults to 'hz', representing a linear frequency scale. Options are 'hz', 'mel', 'erb', + or 'cents'. + :param cref: The reference frequency for calculating cents. Defaults to 27.5Hz. + :param power: The power to raise devaition from specteal centroid, defaults to 2. + + :return: Returns the spectral centroid, spectral spread, and unitless centroid. + """ + # use a hanning window + if window_type == 'hann': + window = np.hanning(len(audio)) + elif window_type == 'none': + window = np.ones(len(audio)) + else: + raise ValueError('Window type must be set to either \'hann\' or \'none\'') + + next_pow_2 = int(pow(2, np.ceil(np.log2(len(window))))) + # get frequency domain representation + spectrum = np.fft.fft((window * audio), next_pow_2) + spectrum = np.absolute(spectrum[0:int(len(spectrum) / 2) + 1]) + + tpower = np.sum(spectrum) + + if tpower > 0: + freq = np.arange(0, len(spectrum), 1) * (fs / (2.0 * (len(spectrum) - 1))) + + # find lowest frequency index, zeros used to unpack result + lf_limit_idx = np.where(freq >= lf_limit)[0][0] + spectrum = spectrum[lf_limit_idx:] + freq = freq[lf_limit_idx:] + + # convert frequency to desired frequency scale + if scale == 'hz': + freq = freq + elif scale == 'mel': + freq = 1127.0 * np.log(1 + (freq / 700.0)) + elif scale == 'erb': + freq = 21.4 * np.log10(1 + (0.00437 * freq)) + elif freq == 'cents': + # for cents, a + freq = 1200.0 * np.log2((freq / cref) + 1.0) + else: + raise ValueError('Frequency scale type not recognised. Please use \'hz\', \'mel\', \'erb\', or \'cents\'.') + + # calculate centroid and spread + centroid = sum(spectrum * freq) / float(sum(spectrum)) + + # old calculation of spread + deviation = np.abs(freq - centroid) + spread = np.sqrt(np.sum((deviation ** 2) * spectrum) / np.sum(spectrum)) + + # new calculation of spread according to librosa + # spread = np.sqrt(np.sum(spectrum * (deviation ** power))) #** (1. / power)) + + cumulative_spectral_power = spectrum[0] + counter = 0 + rollon_threshold = np.sum(spectrum) * rollon_thresh + while cumulative_spectral_power < rollon_threshold: + counter += 1 + cumulative_spectral_power = np.sum(spectrum[:counter]) + + if counter == 0: + counter = 1 + + rollon_frequency = freq[counter] + unitless_centroid = centroid / rollon_frequency + + return centroid, spread, unitless_centroid + else: + return 0 + + +def calculate_attack_time(envelope_samples, fs, calculate_attack_segment=True, thresh_no=8, normalise=True, m=3, + calculation_type='min_effort', gradient_calulation_type='all', return_descriptive_data=False, + max_attack_time=-1): + """ + Calculate the attack time from the envelope of a signal. + + Required inputs + :param envelope_samples: envelope of the audio file, suggested to be calculated with + sample_and_hold_envelope_calculation. + :param fs: sample rate of the envelope_samples. + + Optional inputs + :param calculate_attack_segment: If the attack segment of the onset should be calculated before estimating the + attack time. bool, default to True. + :param thresh_no: Number of thresholds used for calculating the minimum effort method. + int, default to 8. + :param m: value used for computation of minimum effort thresholds, defaults to 3 as s + uggested in the CUIDADO project. + :param calculation_type: method for calculating the attack time, options are 'min_effort' or + 'fixed_threshold', default to 'min_effort'. + :param gradient_calulation_type: Method for calculating the gradient of the attack, options are 'all' for + calculating the gradient from the estimated start and end points, or 'mean' for + calculating the mean gradient between each threshold step in the minimum effort + method. Defaults to 'all' and will revert to 'all' if mean is not available. + :param normalise: Normalise the attack segment. bool, default to True. + :param return_descriptive_data Default to False, if set to True also returns the thresholds for calculating + the min_effort method. + :param max_attack_time: sets the maximum allowable attack time. Defaults to -1, indicating that there + is no maximum attack time. This value should be set in seconds. + + :return: returns the attack_time, attack_gradient, index of the attack start, and + temporal centroid. + """ + if normalise: + # normalise the segments + normalise_factor = float(max(envelope_samples)) + envelope_samples /= normalise_factor + + if calculate_attack_segment: + # identify pre-attack segment + peak_idx = np.argmax(envelope_samples) + if peak_idx == 0: + # exit on error + return 0 + # min_pre_peak_idx = np.argmin(envelope_samples[:peak_idx]) + min_pre_peak_idx = np.where(envelope_samples[:peak_idx] == min(envelope_samples[:peak_idx]))[-1][-1] + + # redefine the envelope samples as just the min to the peak + envelope_samples = envelope_samples[min_pre_peak_idx:peak_idx + 1] + else: + min_pre_peak_idx = 0 + + # calculate the appropriate start and end of the attack using the selected method + if calculation_type == 'min_effort': + # get threshold time array + threshold_step = 1.0 / (thresh_no + 2) # +2 is to ignore the 0 and 100% levels. + dyn_range = max(envelope_samples) - min(envelope_samples) + thresh_level = np.linspace(threshold_step, (1 - threshold_step), thresh_no + 1) + thresh_level = (thresh_level * dyn_range) + min(envelope_samples) + + # predefine an array for when each threshold is crossed + threshold_idxs = np.zeros(thresh_no + 1) + + # get indexes for when threshold is crossed + for j in range(len(thresh_level)): + threshold_hold = np.argmax(envelope_samples >= thresh_level[j]) + # threshold_idxs[j] = threshold_hold + min_pre_peak_idx + threshold_idxs[j] = threshold_hold + + # calculate effort values (distances between thresholds) + effort = np.diff(threshold_idxs) + + # get the mean effort value + effort_mean = np.mean(effort) + effort_threshold = effort_mean * m + + # find start and stop times foxr the attack + th_start = np.argmax(effort <= effort_threshold) + + # need to use remaining effort values + effort_hold = effort[th_start:] + th_end = np.argmax(effort_hold >= effort_threshold) # this returns a 0 if value not found + if th_end == 0: + th_end = len(effort_hold) - 1 # make equal to the last value + + # apply correction for holding the values + th_end = th_end + th_start + + # get the actual start and stop index + th_start_idx = threshold_idxs[th_start] + th_end_idx = threshold_idxs[th_end] + + if th_start_idx == th_end_idx: + th_start_idx = threshold_idxs[0] + th_end_idx = threshold_idxs[-1] + + if th_start_idx == th_end_idx: + attack_time = 1.0 / fs + else: + attack_time = (th_end_idx - th_start_idx + 1.0) / fs + + if max_attack_time > 0: + if attack_time > max_attack_time: + # how many samples is equivalent to the maximum? + max_attack_time_sample = int(fs * max_attack_time) # convert to integer + th_end_idx = th_start_idx + max_attack_time_sample + attack_time = (th_end_idx - th_start_idx + 1.0) / fs + + start_level = envelope_samples[int(th_start_idx)] + end_level = envelope_samples[int(th_end_idx)] + + # specify exceptions for a step functions crossing both thresholds + if start_level == end_level: + if th_start_idx > 0: + # if a previous sample is avaiable, take the previous starting sample + start_level = envelope_samples[int(th_start_idx) - 1] + else: + # set start level to zero if onset is at the first sample (indicating a step function at time zero) + start_level = 0.0 + + # is there enough data to calculate the mean + if gradient_calulation_type == 'mean': + if (end_level - start_level) < 0.2 or (th_end_idx - th_start_idx) < 2: + # force calculation type to all + gradient_calulation_type = 'all' + print('unable to calculate attack gradient with the \'mean\' method, reverting to \'all\' method.') + + if gradient_calulation_type == 'mean': + # calculate the gradient based on the weighted mean of each attack + threshold_step = dyn_range / (thresh_no + 2) + + gradient_thresh_array = np.arange(start_level, end_level + (threshold_step * dyn_range), + (threshold_step * dyn_range)) + cross_threshold_times = np.zeros(len(gradient_thresh_array)) + cross_threshold_values = np.zeros(len(gradient_thresh_array)) + gradient_envelope_segment = envelope_samples[th_start_idx:th_end_idx + 1] + + for i in range(len(cross_threshold_values)): + hold = np.argmax(gradient_envelope_segment >= gradient_thresh_array[i]) + cross_threshold_times[i] = hold[0] / float(fs) + cross_threshold_values[i] = gradient_envelope_segment[hold[0]] + + pente_v = np.diff(cross_threshold_values) / np.diff(cross_threshold_times) + + # calculate weighted average of all gradients with a gausian dsitribution + m_threshold = 0.5 * (gradient_thresh_array[:-1] + gradient_thresh_array[1:]) + weight_v = np.exp(-(m_threshold - 0.5) ** 2 / (0.5 ** 2)) + + attack_gradient = np.sum(pente_v * weight_v) / np.sum(weight_v) + + elif gradient_calulation_type == 'all': + # calculate the attack gradient from th_start_idx to th_end_idx + attack_gradient = (end_level - start_level) / attack_time + + ''' + More stuff to return if we want extra information to be displayed + ''' + thresholds_to_return = [calculation_type, th_start_idx + min_pre_peak_idx, th_end_idx + min_pre_peak_idx, + threshold_idxs + min_pre_peak_idx] + + elif calculation_type == 'fixed_threshold': + # set threshold values for fixed threshold method + fixed_threshold_start = 20 + fixed_threshold_end = 90 + + # get dynamic range + dyn_range = max(envelope_samples) - min(envelope_samples) + + # get thresholds relative to envelope level + lower_threshold = (fixed_threshold_start * dyn_range * 0.01) + min(envelope_samples) + upper_threshold = (fixed_threshold_end * dyn_range * 0.01) + min(envelope_samples) + + # calculate start index + th_start_idx = np.argmax(envelope_samples >= lower_threshold) + # th_start_idx = th_start_idx[0] + + # find the end idx after the start idx + th_end_idx = np.argmax(envelope_samples[th_start_idx:] >= upper_threshold) + th_end_idx = th_end_idx + th_start_idx + + if th_start_idx == th_end_idx: + attack_time = 1.0 / fs + else: + attack_time = (th_end_idx - th_start_idx + 1.0) / fs + + # compare attack time to maximum permissible attack time + if max_attack_time > 0: + if attack_time > max_attack_time: + # how many samples is equivalent to the maximum? + max_attack_time_sample = int(fs * max_attack_time) # convert to integer + th_end_idx = th_start_idx + max_attack_time_sample + attack_time = (th_end_idx - th_start_idx + 1.0) / fs + + # calculate the gradient + + # find the level of the first sample used + start_level = envelope_samples[int(th_start_idx)] + # find the level of the last sample used + end_level = envelope_samples[int(th_end_idx)] + + # specify exceptions for a step functions crossing both thresholds + if start_level == end_level: + if th_start_idx > 0: + # if a previous sample is avaiable, take the previous starting sample + start_level = envelope_samples[int(th_start_idx) - 1] + else: + # set start level to zero if onset is at the first sample (indicating a step function at time zero) + start_level = 0.0 + + attack_gradient = (end_level - start_level) / attack_time + + ''' + More details to be returned if desired + ''' + thresholds_to_return = [calculation_type, th_start_idx + min_pre_peak_idx, th_end_idx + min_pre_peak_idx] + + else: + raise ValueError('calculation_type must be set to either \'fixed_threshold\' or \'min_effort\'.') + + # convert attack time to logarithmic scale + attack_time = np.log10(attack_time) + + # revert attack gradient metric if envelope has been normalised + if normalise: + attack_gradient *= normalise_factor + + ''' + Calculate the temporal centroid + ''' + hold_env = envelope_samples[int(th_start_idx):int(th_end_idx) + 1] + t = np.arange(0, len(hold_env)) / float(fs) + temp_centroid = np.sum(t * hold_env) / np.sum(hold_env) + temp_centroid /= float(len(hold_env)) + + if return_descriptive_data: + return attack_time, attack_gradient, int(th_start_idx + min_pre_peak_idx), temp_centroid, thresholds_to_return + else: + return attack_time, attack_gradient, int(th_start_idx + min_pre_peak_idx), temp_centroid + + +def calculate_onsets(audio_samples, envelope_samples, fs, look_back_time=20, hysteresis_time=300, hysteresis_percent=10, + onset_in_noise_threshold=10, minimum_onset_time_separation=100, nperseg=512): + """ + Calculates the onset times using a look backwards recursive function to identify actual note onsets, and weights + the outputs based on the onset strength to avoid misidentifying onsets. + + Required inputs + :param audio_samples: the audio file in the time domain. + :param envelope_samples: the envelope of the audio file, suggested to be calculated with + sample_and_hold_envelope_calculation. + :param fs: samplerate of the audio file. Function assumes the same sample rate for + both audio_samples and envelop_samples + + Optional inputs + :param look_back_time: time in ms to recursively lookbackwards to identify start of onset, + defaults to 20ms. + :param hysteresis_time: time in ms to look backwards in time for a hysteresis check, + set to 300ms bedefault. + :param hysteresis_percent: set the percentage of dynamic range that must be checked when looking + backwards via hysteresis, default to 10%. + :param onset_in_noise_threshold: set a threshold of dynamic range for determining if an onset was variation + in noise or an actual onset, default to 10%. + :param minimum_onset_time_separation: set the minimum time in ms that two offsets can be separated by. + :param method: set the method for calculating the onsets. Default to 'librosa', but can + be 'essentia_hfc', or 'essentia_complex'. + :param nperseg: value used in return loop. + + :return: thresholded onsets, returns [0] if no onsets are identified. Note that a + value of [0] is also possible during normal opperation. + """ + # get onsets with librosa estimation + onsets = librosa.onset.onset_detect(audio_samples, fs, backtrack=True, units='samples') + + # set values for return_loop method + time_thresh = int(look_back_time * 0.001 * fs) # 10 ms default look-back time, in samples + hysteresis_samples = int(hysteresis_time * fs * 0.001) # hysteresis time, in samples + envelope_dyn_range = max(envelope_samples) - min(envelope_samples) + hysteresis_thresh = envelope_dyn_range * hysteresis_percent * 0.01 + + # only conduct analysis if there are onsets detected + if np.size(onsets): + # empty array for storing exact onset idxs + corrected_onsets = [] + + for onset_idx in onsets: + # if the onset is 1 or 0, it's too close to the start to be corrected (1 is here due to zero padding) + if onset_idx > 0: + # actual onset location in samples (librosa uses 512 window size by default) + onset_loc = np.array(onset_idx).astype('int') + + # only calculate if the onset is NOT at the end of the file, whilst other onsets exist. + # If the only onset is at the end, calculate anyway. + if not corrected_onsets: + onset_hold = return_loop(onset_loc, envelope_samples, time_thresh, hysteresis_thresh, + hysteresis_samples, nperseg=nperseg) + corrected_onsets.append(onset_hold) + else: + if (onset_loc + 511) < len(envelope_samples): + onset_hold = return_loop(onset_loc, envelope_samples, time_thresh, hysteresis_thresh, + hysteresis_samples, nperseg=nperseg) + corrected_onsets.append(onset_hold) + else: + corrected_onsets.append(0) + + # zero is returned from return_loop if no valid onset identified + # remove zeros (except the first) + zero_loc = np.where(np.array(corrected_onsets) == 0)[0] + # ignore if the first value is zero + if list(zero_loc): + if zero_loc[0] == 0: + zero_loc = zero_loc[1:] + corrected_onsets = np.delete(corrected_onsets, zero_loc) + + # remove duplicates + hold_onsets = [] + for i in corrected_onsets: + if i not in hold_onsets: + hold_onsets.append(i) + corrected_onsets = hold_onsets + + ''' + Remove repeated onsets and compare onset segments against the dynamic range + to remove erroneous onsets in noise. If the onset segment (samples between + adjacent onsets) has a dynamic range less than 10% of total dynamic range, + remove this onset. + ''' + if len(corrected_onsets) > 1: + thd_corrected_onsets = [] + last_value = corrected_onsets[-1] + threshold = onset_in_noise_threshold * envelope_dyn_range * 0.01 + + for i in reversed(range(len(corrected_onsets))): + if corrected_onsets[i] == corrected_onsets[-1]: + segment = envelope_samples[corrected_onsets[i]:] + else: + segment = envelope_samples[corrected_onsets[i]:corrected_onsets[i + 1]] + + # only conduct if the segment if greater than 1 sample long + if len(segment) > 1: + # find attack portion SNR + peak_idx = np.argmax(segment) + if peak_idx > 0: + # get the dynamic range of the attack portion + seg_dyn_range = max(segment) - min(segment[:peak_idx]) + if seg_dyn_range >= threshold: + pass + else: + corrected_onsets = np.delete(corrected_onsets, i) + else: + corrected_onsets = np.delete(corrected_onsets, i) + else: + corrected_onsets = np.delete(corrected_onsets, i) + + # remove onsets that are too close together, favouring the earlier onset + if len(corrected_onsets) > 1: + minimum_onset_time_separation_samples = fs * 0.001 * minimum_onset_time_separation + time_separation = np.diff(corrected_onsets) + # while loop for potential multiple itterations + while len(corrected_onsets) > 1 and min(time_separation) < minimum_onset_time_separation_samples: + onsets_to_remove = [] + # some onsets are closer together than the minimum value + for i in range(len(corrected_onsets)-1): + # are the last two onsets too close? + if abs(corrected_onsets[i+1] - corrected_onsets[i]) < minimum_onset_time_separation_samples: + onsets_to_remove.append(i+1) + + # remove onsets too close together + corrected_onsets = np.delete(corrected_onsets, onsets_to_remove) + time_separation = np.diff(corrected_onsets) + + ''' + Correct onsets by comparing to the onset strength. + + If there in an onset strength of 3 or greater between two onsets, then the onset if valid. + Otherwise, discard the onset. + ''' + thd_corrected_onsets = [] + + # get the onset strength + onset_strength = librosa.onset.onset_strength(audio_samples, fs) + + strength_onset_times = np.array(np.array(corrected_onsets) / 512).astype('int') + strength_onset_times.clip(min=0) + + corrected_original_onsets = [] + corrected_strength_onsets = [] + for onset_idx in reversed(range(len(corrected_onsets))): + current_strength_onset = strength_onset_times[onset_idx] + if current_strength_onset == strength_onset_times[-1]: + onset_strength_seg = onset_strength[current_strength_onset:] + else: + onset_strength_seg = onset_strength[current_strength_onset:strength_onset_times[onset_idx + 1]] + + if max(onset_strength_seg) < 3: + strength_onset_times = np.delete(strength_onset_times, onset_idx) + else: + thd_corrected_onsets.append(corrected_onsets[onset_idx]) + + else: + return [0] + + thd_corrected_onsets.sort() + if thd_corrected_onsets: + return thd_corrected_onsets + else: + return [0] + + +def get_bandwidth_array(audio_samples, fs, nperseg=512, overlap_step=32, rolloff_thresh=0.01, + rollon_thresh_percent=0.05, log_bandwidth=False, return_centroid=False, + low_bandwidth_method='Percentile', normalisation_method='RMS_Time_Window'): + """ + Calculate the bandwidth array estimate for an audio signal. + + Required inputs + :param audio_samples: array of the audio samples + :param fs: samplerate of the audio samples + + Optional inputs + :param nperseg: numper of samples used for calculating spectrogram + :param overlap_step: number of samples overlap for calculating spectrogram + :param rolloff_thresh: threshold value for calculating rolloff frequency + :param rollon_thresh_percent: percentage threshold for calculating rollon frequency + :param log_bandwidth: return the logarithm of the bandwdith, default to False + :param return_centroid: return the centroid for each time window + :param low_bandwidth_method: method for calculating the low frequency limit of the bandwidth, + default to 'Percentile' + :param normalisation_method: method for normlaising the spectrogram, default to 'RMS_Time_Window' + + :return: returns the bandwidth array, time array (from spectrogram), and + frequency array (from spectrogram). + """ + noverlap = nperseg - overlap_step + # get spectrogram + f, t, spec = spectrogram(audio_samples, fs, window='boxcar', nperseg=nperseg, noverlap=noverlap, scaling='density', + mode='magnitude') + + # normalise the spectrogram + if normalisation_method == 'Single_TF_Bin': + spec /= np.max(spec) + elif normalisation_method == 'RMS_Time_Window': + spec /= np.max(np.sqrt(np.sum(spec * spec, axis=0))) + elif normalisation_method == "none": + pass + else: + raise ValueError('Bandwidth normalisation method must be \'Single_TF_Bin\' or \'RMS_Time_Window\'') + + # get values for thresholding + level_with_time = np.sum(spec, axis=0) + max_l = np.max(level_with_time) + min_l = np.min(level_with_time) + min_tpower = (0.1 * (max_l - min_l)) + min_l + + + # initialise lists for storage + rollon = [] + rolloff = [] + bandwidth = [] + centroid = [] + centroid_power = [] + + # calculate the bandwidth curve + for time_count in range(len(t)): + seg = spec[:, time_count] + tpower = np.sum(seg) + if tpower > min_tpower: + if low_bandwidth_method == 'Percentile': + # get the spectral rollon + rollon_counter = 1 + cumulative_power = np.sum(seg[:rollon_counter]) + rollon_thresh = tpower * rollon_thresh_percent + + while cumulative_power < rollon_thresh: + rollon_counter += 1 + cumulative_power = np.sum(seg[:rollon_counter]) + rollon.append(f[rollon_counter - 1]) + elif low_bandwidth_method == 'Cutoff': + rollon_idx = np.where(seg >= rolloff_thresh)[0] + if len(rollon_idx): + rollon_idx = rollon_idx[0] + rollon.append(f[rollon_idx]) + else: + raise ValueError('low_bandwidth_method must be \'Percentile\' or \'Cutoff\'') + + # get the spectral rolloff + rolloff_idx = np.where(seg >= rolloff_thresh)[0] + if len(rolloff_idx): + rolloff_idx = rolloff_idx[-1] + rolloff.append(f[rolloff_idx]) + if log_bandwidth: + bandwidth.append(np.log(f[rolloff_idx] / float(f[rollon_counter - 1]))) + else: + bandwidth.append(f[rolloff_idx] - f[rollon_counter - 1]) + else: + bandwidth.append(0) + + # get centroid values + centroid.append(np.sum(seg * f) / np.sum(seg)) + centroid_power.append(tpower) + else: + bandwidth.append(0) + + if return_centroid: + return bandwidth, t, f, np.average(centroid, weights=centroid_power) + else: + return bandwidth, t, f + + +def calculate_bandwidth_gradient(bandwidth_segment, t): + """ + Calculate the gradient ferom the bandwidth array + + :param bandwidth_segment: segment of bandwdith for calculation + :param t: time base for calculating + + :return: gradient of the bandwidth + """ + if bandwidth_segment: + max_idx = np.argmax(bandwidth_segment) + if max_idx > 0: + min_idx = np.where(np.array(bandwidth_segment[:max_idx]) == min(bandwidth_segment[:max_idx]))[0][-1] + + bandwidth_change = bandwidth_segment[max_idx] - bandwidth_segment[min_idx] + time_to_change = (max_idx - min_idx) * (t[1] - t[0]) + + bandwidth_gradient = bandwidth_change / time_to_change + else: + bandwidth_gradient = False + else: + bandwidth_gradient = False + return bandwidth_gradient + + +def calculate_rms_enveope(audio_samples, step_size=256, overlap_step=256, normalise=True): + """ + Calculate the RMS envelope of the audio signal. + + :param audio_samples: numpy array, the audio samples. + :param step_size: int, number of samples to get the RMS from. + :param overlap_step: int, number of samples to overlap. + + :return: RMS array + """ + # initialise lists and counters + rms_envelope = [] + i = 0 + t_hold = [] + # step through the signal + while i < len(audio_samples) - step_size: + rms_envelope.append(np.sqrt(np.mean(audio_samples[i:i + step_size] * audio_samples[i:i + step_size]))) + i += overlap_step + + # use the remainder of the array for a final sample + t_hold.append(i) + rms_envelope.append(np.sqrt(np.mean(audio_samples[i:] * audio_samples[i:]))) + rms_envelope = np.array(rms_envelope) + + # normalise to peak value + if normalise: + rms_envelope = rms_envelope * (1.0 / max(abs(rms_envelope))) + + return rms_envelope + + +def detect_peaks(array, freq=0, cthr=0.2, unprocessed_array=False, fs=44100): + """ + Function detects the peaks in array, based from the mirpeaks algorithm. + + :param array: Array in which to detect peaks + :param freq: Scale representing the x axis (sample length as array) + :param cthr: Threshold for checking adjacent peaks + :param unprocessed_array: Array that in unprocessed (normalised), if False will default to the same as array. + :param fs: Sampe rate of the array + + :return: index of peaks, values of peaks, peak value on freq. + """ + # flatten the array for correct processing + array = array.flatten() + + if np.isscalar(freq): + # calculate the frerquency scale - assuming a samplerate if none provided + freq = np.linspace(0, fs/2.0, len(array)) + + if np.isscalar(unprocessed_array): + unprocessed_array = array + + # add values to allow peaks at the first and last values + array_appended = np.insert(array, [0, len(array)], -2.0) # to allow peaks at start and end (default of mir) + # unprocessed array to get peak values + array_unprocess_appended = np.insert(unprocessed_array, [0, len(unprocessed_array)], -2.0) + # append the frequency scale for precise freq calculation + freq_appended = np.insert(freq, [0, len(freq)], -1.0) + + # get the difference values + diff_array = np.diff(array_appended) + + # find local maxima + mx = np.array(np.where((array >= cthr) & (diff_array[0:-1] > 0) & (diff_array[1:] <= 0))) + 1 + + # initialise arrays for output + finalmx = [] + peak_value = [] + peak_x = [] + peak_idx = [] + + if np.size(mx) > 0: + # unpack the array if peaks found + mx = mx[0] + + j = 0 # scans the peaks from beginning to end + mxj = mx[j] # the current peak under evaluation + jj = j + 1 + bufmin = 2.0 + bufmax = array_appended[mxj] + + if mxj > 1: + oldbufmin = min(array_appended[:mxj-1]) + else: + oldbufmin = array_appended[0] + + while jj < len(mx): + # if adjacent mx values are too close, returns no array + if mx[jj-1]+1 == mx[jj]-1: + bufmin = min([bufmin, array_appended[mx[jj-1]]]) + else: + bufmin = min([bufmin, min(array_appended[mx[jj-1]:mx[jj]-1])]) + + if bufmax - bufmin < cthr: + # There is no contrastive notch + if array_appended[mx[jj]] > bufmax: + # new peak is significant;y higher than the old peak, + # the peak is transfered to the new position + j = jj + mxj = mx[j] # the current peak + bufmax = array_appended[mxj] + oldbufmin = min([oldbufmin, bufmin]) + bufmin = 2.0 + elif array_appended[mx[jj]] - bufmax <= 0: + bufmax = max([bufmax, array_appended[mx[jj]]]) + oldbufmin = min([oldbufmin, bufmin]) + + else: + # There is a contrastive notch + if bufmax - oldbufmin < cthr: + # But the previous peak candidate is too weak and therefore discarded + oldbufmin = min([oldbufmin, bufmin]) + else: + # The previous peak candidate is OK and therefore stored + finalmx.append(mxj) + oldbufmin = bufmin + + bufmax = array_appended[mx[jj]] + j = jj + mxj = mx[j] # The current peak + bufmin = 2.0 + + jj += 1 + if bufmax - oldbufmin >= cthr and (bufmax - min(array_appended[mx[j] + 1:]) >= cthr): + # The last peak candidate is OK and stored + finalmx.append(mx[j]) + + ''' Sort the values according to their level ''' + finalmx = np.array(finalmx) + sort_idx = np.argsort(array_appended[finalmx])[::-1] # descending sort + finalmx = finalmx[sort_idx] + + peak_idx = finalmx - 1 # indexes were for the appended array, -1 to return to original array index + peak_value = array_unprocess_appended[finalmx] + peak_x = freq_appended[finalmx] + + ''' Interpolation for more precise peak location ''' + corrected_value = [] + corrected_position = [] + for current_peak_idx in finalmx: + # if there enough space to do the fitting + if 1 < current_peak_idx < (len(array_unprocess_appended) - 2): + y0 = array_unprocess_appended[current_peak_idx] + ym = array_unprocess_appended[current_peak_idx-1] + yp = array_unprocess_appended[current_peak_idx+1] + p = (yp - ym) / (2 * (2*y0 - yp - ym)) + corrected_value.append(y0 - (0.25*(ym-yp)*p)) + if p >= 0: + correct_pos = ((1 - p) * freq_appended[current_peak_idx]) + (p * freq_appended[current_peak_idx+1]) + corrected_position.append(correct_pos) + elif p < 0: + correct_pos = ((1 + p) * freq_appended[current_peak_idx]) - (p * freq_appended[current_peak_idx-1]) + corrected_position.append(correct_pos) + else: + corrected_value.append(array_unprocess_appended[current_peak_idx]) + corrected_position.append(freq_appended[current_peak_idx]) + + if corrected_position: + peak_x = corrected_position + peak_value = corrected_value + + return peak_idx, peak_value, peak_x + + +def sigmoid(x, offset=0.2, n=10): + # return a sigmoidal function for weighting values + return x ** n / (x ** n + offset) + + +def channel_reduction(audio_samples, phase_correction=False): + """ + Algorithm for reducing the number of channels in a read-in audio file + + :param audio_samples: audio samples + :param phase_correction: perform phase checking on channels before mono sum + + :return: audio samples summed to mono + """ + # get sum all channels to mono + num_channels = np.shape(audio_samples) + if len(num_channels) > 1: + # check for stereo file + if num_channels[1] == 2: + # crudely check for out of phase signals + if phase_correction: + r, pval = scipy.stats.pearsonr(audio_samples[:, 0], audio_samples[:, 1]) + if r < -0.5: + audio_samples = audio_samples[:, 0] # [:,1] *= -1.0 + else: + audio_samples = np.sum(audio_samples, axis=1) + else: + audio_samples = np.sum(audio_samples, axis=1) + # check for multi-channel file + elif num_channels[1] > 2: + # there are multiple layouts for multichannel, I have no way of decoding these with soundfile + audio_samples = np.sum(audio_samples[:,0:3], axis=1) + + #TODO Update to include multichannel variants and decode according to: http://www.atsc.org/wp-content/uploads/2015/03/A52-201212-17.pdf + # elif num_channels[3] > 4: + # elif num_channels[3] > 5: + # elif num_channels[3] > 6: + + return audio_samples + + +def spectral_flux(spectrogram, method='sum'): + """ + This computes the spectral flux: the difference between sucesive spectrogram time frames + + :param spectrogram: + :return: + """ + if method == 'sum': + # sum method + diff_spec = np.diff(spectrogram, axis=1) # difference + sum_flux = np.sqrt(np.sum(diff_spec**2, axis=0))/float(diff_spec.shape[0]); + + return sum_flux + + elif method == 'multiply': + # multiplication between adjacent frames + diff_spec = spectrogram[:,:-1] * spectrogram[:,1:] + sum_diff_spec = (np.sum(diff_spec ** 2.0, axis=0)) # variation acorss time + orig_spec_var = np.sum(spectrogram[:,:-1]** 2.0, axis=0) + delayed_spec_var = np.sum(spectrogram[:,1:]** 2.0, axis=0) + denom = orig_spec_var * delayed_spec_var + + multiply_flux = np.nan_to_num(1 - sum_diff_spec / (orig_spec_var * delayed_spec_var)) + + return multiply_flux + + +def log_sum(array): + """ + This function calculates the log sum of an array + + :param array: + :return: + """ + logsum = 10 * np.log10(np.sum(10 ** (array / 10.0))) + + return logsum + + +def filter_design2(Fc, fs, N): + """ + Design Butterworth 2nd-order one-third-octave filter. + """ + + f1 = (2.0 ** (-1.0/6)) * Fc + f2 = (2.0 ** (1.0/6)) * Fc + f1 = f1 / (fs / 2.0) + f2 = f2 / (fs / 2.0) + + # force f2 to be 1.0 for cases where the upper bandwidth from 3rd_octave_downsample produce higher frequencies + if f2 >= 1.0: + f2 = 0.9999999999 + b, a = scipy.signal.butter(N, [f1, f2], 'bandpass') + return b, a + + +def midbands(Fmin, Fmax, fs): + """ + Divides the frequency range into third octave bands using filters + Fmin is the minimum third octave band + Fmax is the maximum third octave band + """ + + # set defaults + lowest_band = 25 + highest_band = 20000 + Nyquist_frequency = fs / 2.0 + FUpper = (2 ** (1/6.0)) * Fmax + + fr = 1000 # reference frequency is 1000Hz + i = np.arange(-16, 14, 1) + lab_freq = np.array([25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600, + 2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500, 16000, 20000]) + + A = np.where(lab_freq == Fmin)[0][0] + B = np.where(lab_freq == Fmax)[0][0] + + # compare value of B to nyquist + while lab_freq[B] > Nyquist_frequency: + B -= 1 + + + j = i[np.arange(A, B+1, 1)] # indices to find exact midband frequencies + ff = (2.0 ** (j / 3.0)) * fr # Exact midband frequencies (Calculated as base two exact) + F = lab_freq[np.arange(A, B+1, 1)] + return ff, F, j + + +def filter_third_octaves_downsample(x, Pref, fs, Fmin, Fmax, N): + """ + Filters the audio file into thrid octave bands + x is the file (Input length must be a multiple of 2^8) + Pref is the reference level for calculating decibels - does not allow for negative values + Fmin is the minimum frequency + Fmax is the maximum frequency (must be at least 2500 Hz) + Fs is the sampling frequency + N is the filter order + """ + # identify midband frequencies + [ff, F, j] = midbands(Fmin, Fmax, fs) + + # apply filters + P = np.zeros(len(j)) + k = np.where(j == 7)[0][0] # Determines where downsampling will commence (5000 Hz and below) + m = len(x) + + # For frequencies of 6300 Hz or higher, direct implementation of filters. + for i in range(len(j)-1, k, -1): + B, A = filter_design2(ff[i], fs, N) + if i==k+3: # Upper 1/3-oct. band in last octave. + Bu = B; + Au = A; + if i == k + 2: # Center 1/3-oct. band in last octave. + Bc = B; + Ac = A; + if i == k + 1: # Lower 1/3-oct. band in last octave. + Bl = B; + Al = A; + y = scipy.signal.lfilter(B, A, x); + if np.max(y) > 0: + P[i] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0) / m)) # Convert to decibels. + else: + P[i] = -1.0 * np.inf + + # 5000 Hz or lower, multirate filter implementation. + try: + for i in range(k, 1, -3): #= k:-3:1; + # Design anti-aliasing filter (IIR Filter) + Wn = 0.4 + C, D = scipy.signal.cheby1(2, 0.1, Wn) + # Filter + x = scipy.signal.lfilter(C, D, x) + # Downsample + idx = np.arange(1, len(x), 2) + x = x[idx] + fs = fs / 2.0 + m = len(x) + # Performs the filtering + y = scipy.signal.lfilter(Bu, Au, x) + if np.max(y) > 0: + P[i] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0)/m)) + else: + P[i] = -1.0 * np.inf + y = scipy.signal.lfilter(Bc, Ac, x) + if np.max(y) > 0: + P[i-1] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0)/m)) + else: + P[i-1] = -1.0 * np.inf + y = scipy.signal.lfilter(Bl, Al, x) + if np.max(y) > 0: + P[i - 2] = 20 * np.log10(np.sqrt(np.sum(y ** 2.0) / m)) + else: + P[i-2] = -1.0 * np.inf + except: + P = P[1:len(j)] + + # "calibrate" the readings based from Pref, chosen as 100 in most uses + P = P + Pref + + # log transformation + Plog = 10 ** (P / 10.0) + Ptotal = np.sum(Plog) + if Ptotal > 0: + Ptotal = 10 * np.log10(Ptotal) + else: + Ptotal = -1.0 * np.inf + + return Ptotal, P, F + + +def specific_loudness(x, Pref, fs, Mod): + """ + Calculates loudness in 3rd octave bands + based on ISO 532 B / DIN 45631 + Source: BASIC code in J Acoust Soc Jpn(E) 12, 1(1991) + x = signal + Pref = refernce value[dB] + fs = sampling frequency[Hz] + Mod = 0 for free field + Mod = 1 for diffuse field + + Returns + N_entire = entire loudness[sone] + N_single = partial loudness[sone / Bark] + + Original Matlab code by Claire Churchill Jun. 2004 + Transcoded by Andy Pearce 2018 + """ + + # 'Generally used third-octave band filters show a leakage towards neighbouring filters of about -20 dB. This + # means that a 70dB, 1 - kHz tone produces the following levels at different centre + # frequencies: 10dB at 500Hz, 30dB at 630Hz, 50dB at 800Hz and 70dB at 1kHz. + # P211 Psychoacoustics: Facts and Models, E.Zwicker and H.Fastl + # (A filter order of 4 gives approx this result) + + # set default + Fmin = 25 + Fmax = 12500 + order = 4 + # filter the audio + Ptotal, P, F = filter_third_octaves_downsample(x, Pref, fs, Fmin, Fmax, order); + + + # set more defaults for perceptual filters + + # Centre frequencies of 1 / 3 Oct bands(FR) + FR = np.array([25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600, + 2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500]) + + # Ranges of 1 / 3 Oct bands for correction at low frequencies according to equal loudness contours + RAP = np.array([45, 55, 65, 71, 80, 90, 100, 120]) + + # Reduction of 1/3 Oct Band levels at low frequencies according to equal loudness contours + # within the eight ranges defined by RAP(DLL) + DLL = np.array([[-32, -24, -16, -10, -5, 0, -7, -3, 0, -2, 0], + [-29, -22, -15, -10, -4, 0, -7, -2, 0, -2, 0], + [-27, -19, -14, -9, -4, 0, -6, -2, 0, -2, 0], + [-25, -17, -12, -9, -3, 0, -5, -2, 0, -2, 0], + [-23, -16, -11, -7, -3, 0, -4, -1, 0, -1, 0], + [-20, -14, -10, -6, -3, 0, -4, -1, 0, -1, 0], + [-18, -12, -9, -6, -2, 0, -3, -1, 0, -1, 0], + [-15, -10, -8, -4, -2, 0, -3, -1, 0, -1, 0]]) + + # Critical band level at absolute threshold without taking into account the + # transmission characteristics of the ear + LTQ = np.array([30, 18, 12, 8, 7, 6, 5, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]) # Threshold due to internal noise + # Hearing thresholds for the excitation levels (each number corresponds to a critical band 12.5kHz is not included) + + # Attenuation representing transmission between freefield and our hearing system + A0 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5, -1.6, -3.2, -5.4, -5.6, -4, -1.5, 2, 5, 12]) + # Attenuation due to transmission in the middle ear + # Moore et al disagrees with this being flat for low frequencies + + # Level correction to convert from a free field to a diffuse field(last critical band 12.5 kHz is not included) + DDF = np.array([0, 0, 0.5, 0.9, 1.2, 1.6, 2.3, 2.8, 3, 2, 0, -1.4, -2, -1.9, -1, 0.5, 3, 4, 4.3, 4]) + + # Correction factor because using third octave band levels(rather than critical bands) + DCB = np.array([-0.25, -0.6, -0.8, -0.8, -0.5, 0, 0.5, 1.1, 1.5, 1.7, 1.8, 1.8, 1.7, 1.6, 1.4, 1.2, 0.8, + 0.5, 0, -0.5]) + + # Upper limits of the approximated critical bands + ZUP = np.array([0.9, 1.8, 2.8, 3.5, 4.4, 5.4, 6.6, 7.9, 9.2, 10.6, 12.3, 13.8, 15.2, 16.7, 18.1, 19.3, 20.6, 21.8, + 22.7, 23.6, 24]) + + # Range of specific loudness for the determination of the steepness of the upper slopes in the specific loudness + # - critical band rate pattern(used to plot the correct USL curve) + RNS = np.array([21.5, 18, 15.1, 11.5, 9, 6.1, 4.4, 3.1, 2.13, 1.36, 0.82, 0.42, 0.30, 0.22, 0.15, 0.10, 0.035, 0]) + + # This is used to design the right hand slope of the loudness + USL = np.array([[13.0, 8.2, 6.3, 5.5, 5.5, 5.5, 5.5, 5.5], + [9.0, 7.5, 6.0, 5.1, 4.5, 4.5, 4.5, 4.5], + [7.8, 6.7, 5.6, 4.9, 4.4, 3.9, 3.9, 3.9], + [6.2, 5.4, 4.6, 4.0, 3.5, 3.2, 3.2, 3.2], + [4.5, 3.8, 3.6, 3.2, 2.9, 2.7, 2.7, 2.7], + [3.7, 3.0, 2.8, 2.35, 2.2, 2.2, 2.2, 2.2], + [2.9, 2.3, 2.1, 1.9, 1.8, 1.7, 1.7, 1.7], + [2.4, 1.7, 1.5, 1.35, 1.3, 1.3, 1.3, 1.3], + [1.95, 1.45, 1.3, 1.15, 1.1, 1.1, 1.1, 1.1], + [1.5, 1.2, 0.94, 0.86, 0.82, 0.82, 0.82, 0.82], + [0.72, 0.67, 0.64, 0.63, 0.62, 0.62, 0.62, 0.62], + [0.59, 0.53, 0.51, 0.50, 0.42, 0.42, 0.42, 0.42], + [0.40, 0.33, 0.26, 0.24, 0.24, 0.22, 0.22, 0.22], + [0.27, 0.21, 0.20, 0.18, 0.17, 0.17, 0.17, 0.17], + [0.16, 0.15, 0.14, 0.12, 0.11, 0.11, 0.11, 0.11], + [0.12, 0.11, 0.10, 0.08, 0.08, 0.08, 0.08, 0.08], + [0.09, 0.08, 0.07, 0.06, 0.06, 0.06, 0.06, 0.05], + [0.06, 0.05, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02]]) + + # apply weighting factors + Xp = np.zeros(11) + Ti = np.zeros(11) + for i in range(11): + j = 0 + while (P[i] > (RAP[j] - DLL[j, i])) & (j < 7): + j += 1 + Xp[i] = P[i] + DLL[j, i] + Ti[i] = 10.0 ** (Xp[i] / 10.0) + + # Intensity values in first three critical bands calculated + Gi = np.zeros(3) + Gi[0] = np.sum(Ti[0:6]) # Gi(1) is the first critical band (sum of two octaves(25Hz to 80Hz)) + Gi[1] = np.sum(Ti[6:9]) # Gi(2) is the second critical band (sum of octave(100Hz to 160Hz)) + Gi[2] = np.sum(Ti[9:11]) # Gi(3) is the third critical band (sum of two third octave bands(200Hz to 250Hz)) + + if np.max(Gi) > 0.0: + FNGi = 10 * np.log10(Gi) + else: + FNGi = -1.0 * np.inf + LCB = np.zeros_like(Gi) + for i in range(3): + if Gi[i] > 0: + LCB[i] = FNGi[i] + else: + LCB[i] = 0 + + # Calculate the main loudness in each critical band + Le = np.ones(20) + Lk = np.ones_like(Le) + Nm = np.ones(21) + for i in range(20): + Le[i] = P[i+8] + if i <= 2: + Le[i] = LCB[i] + Lk[i] = Le[i] - A0[i] + Nm[i] = 0 + if Mod == 1: + Le[i] = Le[i] + DDF[i] + if Le[i] > LTQ[i]: + Le[i] = Lk[i] - DCB[i] + S = 0.25 + MP1 = 0.0635 * 10.0 ** (0.025 * LTQ[i]) + MP2 = (1 - S + S * 10 ** (0.1 * (Le[i] - LTQ[i]))) ** 0.25 - 1 + Nm[i] = MP1 * MP2; + if Nm[i] <= 0: + Nm[i] = 0 + Nm[20] = 0 + + KORRY = 0.4 + 0.32 * Nm[0] ** 0.2 + if KORRY > 1: + KORRY = 1 + + Nm[0] = Nm[0] * KORRY + + # Add masking curves to the main loudness in each third octave band + N = 0 + z1 = 0 # critical band rate starts at 0 + n1 = 0 # loudness level starts at 0 + j = 17 + iz = 0 + z = 0.1 + ns = [] + + for i in range(21): + # Determines where to start on the slope + ig = i-1 + if ig > 7: + ig = 7 + control = 1 + while (z1 < ZUP[i]) | (control == 1): # ZUP is the upper limit of the approximated critical band + # Determines which of the slopes to use + if n1 < Nm[i]: # Nm is the main loudness level + j = 0 + while RNS[j] > Nm[i]: # the value of j is used below to build a slope + j += 1 # j becomes the index at which Nm(i) is first greater than RNS + + # The flat portions of the loudness graph + if n1 <= Nm[i]: + z2 = ZUP[i] # z2 becomes the upper limit of the critical band + n2 = Nm[i] + N = N + n2 * (z2 - z1) # Sums the output(N_entire) + for k in np.arange(z, z2+0.01, 0.1): + if not ns: + ns.append(n2) + else: + if iz == len(ns): + ns.append(n2) + elif iz < len(ns): + ns[iz] = n2 + + if k < (z2 - 0.05): + iz += 1 + z = k # z becomes the last value of k + z = round(z * 10) * 0.1 + + # The sloped portions of the loudness graph + if n1 > Nm[i]: + n2 = RNS[j] + if n2 < Nm[i]: + n2 = Nm[i] + dz = (n1 - n2) / USL[j, ig] # USL = slopes + dz = round(dz * 10) * 0.1 + if dz == 0: + dz = 0.1 + z2 = z1 + dz + if z2 > ZUP[i]: + z2 = ZUP[i] + dz = z2 - z1 + n2 = n1 - dz * USL[j, ig] # USL = slopes + N = N + dz * (n1 + n2) / 2.0 # Sums the output(N_entire) + for k in np.arange(z, z2+0.01, 0.1): + if not ns: + ns.append(n1 - (k - z1) * USL[j, ig]) + else: + if iz == len(ns): + ns.append(n1 - (k - z1) * USL[j, ig]) + elif iz < len(ns): + ns[iz] = n1 - (k - z1) * USL[j, ig] + if k < (z2 - 0.05): + iz += 1 + z = k + z = round(z * 10) * 0.1 + if n2 == RNS[j]: + j += 1 + if j > 17: + j = 17 + n1 = n2 + z1 = z2 + z1 = round(z1 * 10) * 0.1 + control += 1 + + if N < 0: + N = 0 + + if N <= 16: + N = np.floor(N * 1000 + 0.5) / 1000.0 + else: + N = np.floor(N * 100 + .05) / 100.0 + + LN = 40.0 * (N + 0.0005) ** 0.35 + + if LN < 3: + LN = 3 + + if N >= 1: + LN = 10 * np.log10(N) / np.log10(2) + 40; + + N_single = np.zeros(240) + for i in range(240): + N_single[i] = ns[i] + + N_entire = N + return N_entire, N_single + + +def output_clip(score, min_score=0, max_score=100): + """ + Limits the output of the score between min_score and max_score + + :param score: + :param min_score: + :param max_score: + :return: + """ + if score < min_score: + return 0.0 + elif score > max_score: + return 100.0 + else: + return score + + +def fast_hilbert(array, use_matlab_hilbert=False): + """ + Calculates the hilbert transform of the array by segmenting signal first to speed up calculation. + :param array: + :return: + """ + step_size = 32768 + overlap = 2 + overlap_size = int(step_size/(2*overlap)) + # how many steps, rounded to nearest int + # step_no = int((len(array) / (step_size - overlap)) + 0.5) + step_start = 0 + hold_hilbert = np.array([]) + while (step_start + step_size) < len(array): + hold_array = array[step_start:step_start+step_size] + if use_matlab_hilbert: + this_hilbert = np.abs(matlab_hilbert(hold_array)) + else: + this_hilbert = np.abs(scipy.signal.hilbert(hold_array)) + + if step_start == 0: + # try to concatonate the results + hold_hilbert = np.concatenate((hold_hilbert,this_hilbert[:3*overlap_size])) + else: + hold_hilbert = np.concatenate((hold_hilbert, this_hilbert[overlap_size:3*overlap_size])) + + # increment the step + step_start += int(step_size/overlap) + + # do the last step + hold_array = array[step_start:] + this_hilbert = np.abs(scipy.signal.hilbert(hold_array)) + + # try to concatonate the results + hold_hilbert = np.concatenate((hold_hilbert, this_hilbert[overlap_size:])) + return hold_hilbert + + +def fast_hilbert_spectrum(array, use_matlab_hilbert=False): + """ + Calculates the hilbert transform of the array by segmenting signal first to speed up calculation. + :param array: + :return: + """ + step_size = 32768 + overlap = 2 + overlap_size = int(step_size/(2*overlap)) + step_start = 0 + hold_HILBERT = [] + if (step_start + step_size) < len(array): + while (step_start + step_size) < len(array): + hold_array = array[step_start:step_start+step_size] + if use_matlab_hilbert: + this_hilbert = np.abs(matlab_hilbert(hold_array)) + else: + this_hilbert = np.abs(scipy.signal.hilbert(hold_array)) + + HILBERT = np.abs(np.fft.fft(np.abs(this_hilbert))) + HILBERT = HILBERT[0:int(len(HILBERT) / 2.0)] # take the real part + hold_HILBERT.append(HILBERT) + + step_start += int(step_size/overlap) + + # hilbert_spectrum = np.sum(hold_HILBERT, axis=0) + hilbert_spectrum = np.mean(hold_HILBERT, axis=0) + + else: + # how much to pad by + array = np.pad(array, (0, step_size - len(array)), 'constant', constant_values=0.0) + + if use_matlab_hilbert: + this_hilbert = np.abs(matlab_hilbert(array)) + else: + this_hilbert = np.abs(scipy.signal.hilbert(array)) + + HILBERT = np.abs(np.fft.fft(np.abs(this_hilbert))) + HILBERT = HILBERT[0:int(len(HILBERT) / 2.0)] # take the real part + + hilbert_spectrum = HILBERT + + return hilbert_spectrum + + +def matlab_hilbert(signal): + ''' + Define a method for calculating the hilbert transform of a 1D array using the method from Matlab + + :param signal: + :return: + ''' + # get the fft + n = len(signal) + x = np.fft.fft(signal) + h = np.zeros(n) + + if (n>0) and (~isodd(n)): + # even and nonempty + h[0] = 1 + h[int(n/2)] = 1 + h[1:int(n/2)] = 2 + elif n>0: + # odd and nonempty + h[0] = 1 + h[1:int((n+1)/2.0)] = 2 + + # this is the hilbert bit + x = np.fft.ifft(x * h) + + return x + + + +def isodd(num): + return num & 0x1 + + +def window_audio(audio_samples, window_length=4096): + """ + Segment the audio samples into a numpy array the correct size and shape, so that each row is a new window of audio + :param audio_samples: + :param window_length: + :param overlap: + :return: + """ + remainder = np.mod(len(audio_samples), window_length) # how many samples are left after division + + #zero pad audio samples + audio_samples = np.pad(audio_samples, (0, int(window_length-remainder)), 'constant', constant_values=0.0) + windowed_samples = np.reshape(audio_samples, (int(len(audio_samples) / window_length), int(window_length))) + + return windowed_samples + + +def normal_dist(array, theta=1.0, mean=0.0): + y = (1.0 / (theta * np.sqrt(2.0 * np.pi))) * np.exp((-1.0 * ((array - mean)**2.0)) / 2.0 * (theta ** 2.0)) + return y + + +def weighted_bark_level(audio_samples, fs, low_bark_band=0, upper_bark_band=240): + #window the audio + windowed_samples = window_audio(audio_samples) + + # need to define a function for the roughness stimuli, emphasising the 20 - 40 region (of the bark scale) + mean_bark_band = (low_bark_band + upper_bark_band) / 2.0 + array = np.arange(low_bark_band, upper_bark_band) + x = normal_dist(array, theta=0.01, mean=mean_bark_band) + x -= np.min(x) + x /= np.max(x) + + weight_array = np.zeros(240) + weight_array[low_bark_band:upper_bark_band] = x + + windowed_loud_spec = [] + windowed_rms = [] + weighted_vals = [] + + for i in range(windowed_samples.shape[0]): + samples = windowed_samples[i, :] + N_entire, N_single = specific_loudness(samples, Pref=100.0, fs=fs, Mod=0) + + # append the loudness spec + windowed_loud_spec.append(N_single) + windowed_rms.append(np.sqrt(np.mean(samples * samples))) + weighted_vals.append(np.sum(weight_array * N_single)) + + mean_weight = np.mean(weighted_vals) + weighted_weight = np.average(weighted_vals, weights=windowed_rms) + + return mean_weight, weighted_weight + + + +''' + Loudnorm function to be included in future update +''' +def loud_norm(audio, fs=44100, target_loudness=-24.0): + ''' + Takes in audio data and returns the same audio loudness normalised + :param audio: + :param fs: + :param target_loudness: + :return: + ''' + meter = pyln.Meter(fs) + + # minimum length of file is 0.4 seconds + if len(audio) < (fs * 0.4): + # how much longer does the file need to be? + samples_needed = int(fs * 0.4) - len(audio) + + # zero pad signal + len_check_audio = np.pad(audio, (0, samples_needed), 'constant', constant_values=0.0) + else: + len_check_audio = audio + + # assess the current loudness + current_loudness = meter.integrated_loudness(len_check_audio) + normalised_audio = pyln.normalize.loudness(audio, current_loudness, target_loudness) + + # check for clipping and reduce level + if np.max(np.abs(normalised_audio)) > 1.0: + normalised_audio /= np.max(np.abs(normalised_audio)) + + return normalised_audio + + + + + +def file_read(fname, fs=0, phase_correction=False, mono_sum=True, loudnorm=True, resample_low_fs=True): + """ + Read in audio file, but check if it's already an array + Return samples if already an array. + :param fname: + :return: + """ + if isinstance(fname, six.string_types): + # use pysoundfile to read audio + audio_samples, fs = sf.read(fname, always_2d=False) + + elif hasattr(fname, 'shape'): + if fs==0: + raise ValueError('If giving function an array, \'fs\' must be specified') + audio_samples = fname + + else: + raise TypeError('Input type of \'fname\' must be string, or have a shape attribute (e.g. a numpy array)') + + # check audio file contains data + if audio_samples.size==0: + raise ValueError('Input audio file does not contain data') + + # reduce to mono + if mono_sum: + audio_samples = channel_reduction(audio_samples, phase_correction) + + # check data has values + if np.max(np.abs(audio_samples)) == 0.0: + raise ValueError('Input file is silence, cannot be analysed.') + + # loudness normalise + if loudnorm: + audio_samples = loud_norm(audio_samples, fs, target_loudness=-24.0) + + if resample_low_fs: + # check if upsampling required and perform to avoid errors + audio_samples, fs = check_upsampling(audio_samples, fs) + + return audio_samples, fs + + + +def check_upsampling(audio_samples, fs, lowest_fs=44100): + """ + Check if upsampling needfs to be applied, then perform it if necessary + + :param audio_samples: + :param fs: + :return: + """ + if fs < lowest_fs: + # upsample file to avoid errors when calculating specific loudness + audio_samples = librosa.core.resample(audio_samples, fs, lowest_fs) + fs = lowest_fs + + return audio_samples, fs diff --git a/timbral_models.egg-info/PKG-INFO b/timbral_models.egg-info/PKG-INFO index 7693e6f..34e7723 100644 --- a/timbral_models.egg-info/PKG-INFO +++ b/timbral_models.egg-info/PKG-INFO @@ -1,77 +1,101 @@ -Metadata-Version: 1.1 +Metadata-Version: 2.1 Name: timbral-models -Version: 0.2.2 +Version: 0.4.0 Summary: Algorithms for predicting the timbral characteristics of audio files Home-page: https://github.com/AudioCommons/timbral_models Author: Andy Pearce Author-email: andy.pearce@surrey.ac.uk License: Apache Software -Description-Content-Type: text/markdown -Description: # AudioCommons Timbral Models - This work is part of the [AudioCommons project](https://www.audiocommons.org). - This distribution contains Python scripts developed for extracting timbral attributes of audio files. - - More detailed explanations of how the models function can be found in Deliverable D5.6: Second prototype of timbral characterisation tools for semantically annotating non-musical content, available: http://www.audiocommons.org/materials/ - - - # Installing the package - The timbral_models package can be installed using the pip command. This will handle all dependencies. - ``` - pip install timbral_models - ``` - Note that during test installing the package with only basic python installed, an error occurred when installing dependencies. This can be overcome by first installing numpy, followed by timbral_models. - ``` - pip install numpy - pip install timbral_models - ``` - - - # Dependencies - The script can also be downloaded manually from the github repository (https://github.com/AudioCommons/timbral_models). If doing this, dependencies will need to be manually installed. The timbral models rely on several other easily accessible python packages: `numpy`, `soundfile`, `librosa`, `sklearn`, and `scipy`. These are all easily installed using the `pip install` command. e.g. - ``` - $ pip install numpy - $ pip install soundfile - $ pip install librosa - $ pip install scipy - $ pip install sklearn - ``` - - # Using the models - The models are written as a package which can be imported into a Python script. Within the package are seven methods that predict the *hardness*, *depth*, *brightness*, *warmth*, *sharpness*, *booming*, and *roughness* of an audio file. These are named `timbral_xxx(fname)`, where `xxx` represents the timbral model. - - To calculate the timbral attribute, give the method a string of the file name. The method will then read in the audio file internally and return the timbral characteristic, as described below. - - - # Model output - The *hardness*, *depth*, *brightness*, and *warmth* models predict subjective ratings of their respective attributes. Each model returns a float. These models were trained on subjective ratings ranging from 0 to 100, but can extend beyond this range. See Deliverbale D5.6 for full documentation on implementation and optional parameters. - - The *roughness*, *sharpness*, and *booming* models return a float representing their subjective attribute for the audio file. The minimum value is 0.0, but there is no upper limit on the maximum value. - - - # Example usage - - ``` - from timbral_models import timbral_brightness - - # generic file location - fname = '/Users/User/Music/AudioFileToTest.wav' - - # calculate brightness - brightness = timbral_brightness(fname) - ``` - - - # Version History - This section documents the version history of the timbral models. To download a specific version of the model that relate to a specific deliverable, please check this section and download the most recent version from that date. - - 2018/07/26 - Version 0.2 of timbral models, relates to Audio Commons Deliverable D5.6. This version of the repository relates to the software version 0.2 on PyPI. - - 2017/09/05 - Version 0.1 of timbral models, relates to Audio Commons Deliverable D5.3. This version of the repository relates to the software version 0.1 on PyPI. - - 2017/04/27 - Version 0.0 of the timbral models, relates to Audio Commons Deliverable D5.2. - -Platform: UNKNOWN Classifier: Development Status :: 2 - Pre-Alpha Classifier: Topic :: Multimedia :: Sound/Audio :: Analysis Classifier: License :: OSI Approved :: Apache Software License Classifier: Programming Language :: Python :: 2.7 +Description-Content-Type: text/markdown +License-File: LICENSE + +## AudioCommons Timbral Models +The timbral models were devleoped by the [Institute of Sound Recording (IoSR)](http://www.iosr.uk/AudioCommons/) at the University of Surrey, and was completed as part of the [AudioCommons project](https://www.audiocommons.org). + +The current distribution contains python scripts for predicting eight timbral characteristics: *hardness*, *depth*, *brightness*, *roughness*, *warmth*, *sharpness*, *booming*, and *reverberation*. + +More detailed explanations of how the models function can be found in Deliverable D5.8: Release of timbral characterisation tools for semantically annotating non-musical content, available: http://www.audiocommons.org/materials/ + + +## Installing the package +The timbral_models package can be installed using the pip command. This will handle installation of all dependencies. In the update to version 0.4, the dependency to essentia was removed and only pip installable packages are required. +``` +pip install timbral_models +``` + +Please note that during testing, pip was unable to install some of the dependencies and produced an error. In these cases, either rerun the `pip install timbral_models` command or install the offending dependency directly, e.g. `pip install numpy`. + +The package can also be installed locally and be made editable. To do this, clone the repository, navigate to the folder, and run the pip command `pip install -e .`. In this method, dependencies will not be installed. + + +## Dependencies +The script can also be downloaded manually from the github repository (https://github.com/AudioCommons/timbral_models). If doing this, dependencies will need to be manually installed. The timbral models rely on several other easily accessible python packages: `numpy`, `soundfile`, `librosa`, `sklearn`, and `scipy`. These are all easily installed using the `pip install` command. e.g. +``` +$ pip install numpy +$ pip install soundfile +$ pip install librosa +$ pip install scipy +$ pip install sklearn +$ pip install six +$ pip install pyloudnorm +``` + + +## Using the models +The models are formatted in a python package that can be simply imported into a Python script. +The timbral extractor can be used to extract all timbral attributes with a single function call. + +To calculate the timbral attributes, pass the timbral extractor function a string of the filename. The method will then read in the audio file internally and return all timbral characteristics. +``` +import timbral_models +fname = '/Documents/Music/TestAudio.wav' +timbre = timbral_models.timbral_extractor(fname) +``` +In this example, `timbre` will be a python dictionary containing the predicted *hardness*, *depth*, *brightness*, *roughness*, *warmth*, *sharpness*, *booming*, and *reverberation* of the specified audio file. + + +### Single attribute calculation + +Alternative, each timbral attribute can be calculated individually by calling the specific timbral function, e.g. `timbral_hardness(fname)`. +These are named `timbral_xxx(fname)`, where `xxx` represents the timbral model, and also require a string of the filename to be analysed. +``` +import timbral_models +fname = '/Documents/Music/TestAudio.wav' +timbre = timbral_models.timbral_hardness(fname) +``` + + +## Model output +The *hardness*, *depth*, *brightness*, *roughness*, *warmth*, *sharpness*, and *booming* are regression based models, trained on subjective ratings ranging from 0 to 100. However, the output may be beyond these ranges. +The `clip_output` optional parameter can be used to contrain the outputs between 0 and 100. +``` +timbre = timbral_models.timbral_extractor(fname, clip_output=True) +``` +For additional optional parameters, please see Deliverable D5.8. + +The *reverb* attribute is a classification model, returning 1 or 0, indicating if the file "sounds reverberant" or "does not sound reverberant" respectively. + + +## MATLAB Reverb model +Also contained in this repository is a full version of the timbral reverb model. For instruction on installing and using this, please see Deliverable D5.8. + +## Version History +This section documents the version history of the timbral models. To download a specific version of the model that relate to a specific deliverable, please check this section and download the most recent version from that date. + +2019/01/24 - Version 0.4 of timbral_models, relates to Audio Commons Deliverable D5.8. This version of the repository relates to the software version 0.4 on PyPI. + +2018/12/14 - Version 0.3 of timbral models, relates to Audio Commons Deliverable D5.7. This version of the repository relates to the software version 0.3 on PyPI. + +2018/07/26 - Version 0.2 of timbral models, relates to Audio Commons Deliverable D5.6. This version of the repository relates to the software version 0.2 on PyPI. + +2017/09/05 - Version 0.1 of timbral models, relates to Audio Commons Deliverable D5.3. This version of the repository relates to the software version 0.1 on PyPI. + +2017/04/27 - Version 0.0 of the timbral models, relates to Audio Commons Deliverable D5.2. + + +## Citation +For refencing these models, please reference Deliverable D5.8, available: http://www.audiocommons.org/materials/ diff --git a/timbral_models.egg-info/SOURCES.txt b/timbral_models.egg-info/SOURCES.txt index 1a922f4..c7c2782 100644 --- a/timbral_models.egg-info/SOURCES.txt +++ b/timbral_models.egg-info/SOURCES.txt @@ -1,10 +1,13 @@ +LICENSE README.md setup.cfg setup.py timbral_models/Timbral_Booming.py timbral_models/Timbral_Brightness.py timbral_models/Timbral_Depth.py +timbral_models/Timbral_Extractor.py timbral_models/Timbral_Hardness.py +timbral_models/Timbral_Reverb.py timbral_models/Timbral_Roughness.py timbral_models/Timbral_Sharpness.py timbral_models/Timbral_Warmth.py diff --git a/timbral_models.egg-info/requires.txt b/timbral_models.egg-info/requires.txt index 6c2b9d6..c47b3dd 100644 --- a/timbral_models.egg-info/requires.txt +++ b/timbral_models.egg-info/requires.txt @@ -2,4 +2,6 @@ numpy soundfile scipy librosa -sklearn +scikit-learn +pyloudnorm +six diff --git a/timbral_models/timbral_util.py b/timbral_models/timbral_util.py index 01c7255..c8f3eb3 100644 --- a/timbral_models/timbral_util.py +++ b/timbral_models/timbral_util.py @@ -747,7 +747,7 @@ def calculate_onsets(audio_samples, envelope_samples, fs, look_back_time=20, hys thd_corrected_onsets = [] # get the onset strength - onset_strength = librosa.onset.onset_strength(audio_samples, fs) + onset_strength = librosa.onset.onset_strength(y=audio_samples, sr=fs) strength_onset_times = np.array(np.array(corrected_onsets) / 512).astype('int') strength_onset_times.clip(min=0) From 138a02ab38c1336d93ac01c51154de1f3bb9d9df Mon Sep 17 00:00:00 2001 From: Rodrigo Loyola Date: Tue, 5 Dec 2023 15:10:14 +0100 Subject: [PATCH 4/4] Same librosa call changes with y= and sr= --- timbral_models/Timbral_Hardness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timbral_models/Timbral_Hardness.py b/timbral_models/Timbral_Hardness.py index 7f782d0..8ec6224 100644 --- a/timbral_models/Timbral_Hardness.py +++ b/timbral_models/Timbral_Hardness.py @@ -85,7 +85,7 @@ def timbral_hardness(fname, fs=0, dev_output=False, phase_correction=False, clip # calculate the onsets original_onsets = timbral_util.calculate_onsets(audio_samples, envelope, fs, nperseg=nperseg) - onset_strength = librosa.onset.onset_strength(audio_samples, fs) + onset_strength = librosa.onset.onset_strength(y=audio_samples, sr=fs) # If onsets don't exist, set it to time zero if not original_onsets: original_onsets = [0]