From 7af75c87914ed556c86d79d1c2096a405b402969 Mon Sep 17 00:00:00 2001
From: Gerardo Roa
Date: Wed, 9 Oct 2024 11:45:05 +0100
Subject: [PATCH 01/10] replace jiwer with alt-eval

Signed-off-by: Gerardo Roa
---
 recipes/cad2/task1/baseline/evaluate.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/recipes/cad2/task1/baseline/evaluate.py b/recipes/cad2/task1/baseline/evaluate.py
index 4bee312a..0112d09b 100644
--- a/recipes/cad2/task1/baseline/evaluate.py
+++ b/recipes/cad2/task1/baseline/evaluate.py
@@ -11,7 +11,7 @@
 import pyloudnorm as pyln
 import torch.nn
 import whisper
-from jiwer import compute_measures
+from alt_eval import compute_metrics
 from omegaconf import DictConfig
 
 from clarity.enhancer.multiband_compressor import MultibandCompressor
@@ -108,7 +108,9 @@ def compute_intelligibility(
     hypothesis = scorer.transcribe(left_path.as_posix(), fp16=False)["text"]
     lyrics["hypothesis_left"] = hypothesis
 
-    left_results = compute_measures(reference, hypothesis)
+    left_results = compute_metrics(
+        reference, hypothesis, languages="en", include_other=False
+    )
 
     # Compute right ear
     ear.set_audiogram(listener.audiogram_right)
@@ -123,7 +125,9 @@ def compute_intelligibility(
     hypothesis = scorer.transcribe(right_path.as_posix(), fp16=False)["text"]
     lyrics["hypothesis_right"] = hypothesis
 
-    right_results = compute_measures(reference, hypothesis)
+    right_results = compute_metrics(
+        reference, hypothesis, languages="en", include_other=False
+    )
 
     # Compute the average score for both ears
     total_words = (

From 1bae315cabbff5f02be042c6ab9cc34e163416d3 Mon Sep 17 00:00:00 2001
From: Gerardo Roa
Date: Wed, 9 Oct 2024 12:38:45 +0100
Subject: [PATCH 02/10] Add verbose True/False to MSBG to reduce outputs.
 Defaults to True for backward compatibility

Signed-off-by: Gerardo Roa
---
 clarity/evaluator/msbg/cochlea.py    | 11 +++++++--
 clarity/evaluator/msbg/msbg.py       | 35 +++++++++++++++++++---------
 clarity/evaluator/msbg/msbg_utils.py | 14 +++++++----
 3 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/clarity/evaluator/msbg/cochlea.py b/clarity/evaluator/msbg/cochlea.py
index 5d47bfbb..63c74808 100644
--- a/clarity/evaluator/msbg/cochlea.py
+++ b/clarity/evaluator/msbg/cochlea.py
@@ -15,6 +15,7 @@
 from clarity.evaluator.msbg.smearing import Smearer
 from clarity.utils.audiogram import Audiogram
 
+
 # TODO: Fix power overflow error when (expansion_ratios[ixch] - 1) < 0
 
 
@@ -224,7 +225,11 @@ class Cochlea:
     """
 
     def __init__(
-        self, audiogram: Audiogram, catch_up_level: float = 105.0, fs: float = 44100.0
+        self,
+        audiogram: Audiogram,
+        catch_up_level: float = 105.0,
+        fs: float = 44100.0,
+        verbose=True,
     ) -> None:
         """Cochlea constructor.
 
@@ -233,6 +238,7 @@
             catch_up_level (float, optional): loudness catch-up level in dB
                 Default is 105 dB
             fs (float, optional): sampling frequency
+            verbose (bool, optional): verbose mode. Default is True
         """
 
         self.fs = fs
@@ -254,7 +260,8 @@ def __init__(
             r_lower, r_upper = HL_PARAMS[severity_level]["smear_params"]
             self.smearer = Smearer(r_lower, r_upper, fs)
 
-        logging.info("Severity level - %s", severity_level)
+        if verbose:
+            logging.info("Severity level - %s", severity_level)
 
     def simulate(self, coch_sig: ndarray, equiv_0dB_file_SPL: float) -> ndarray:
         """Pass a signal through the cochlea.
diff --git a/clarity/evaluator/msbg/msbg.py b/clarity/evaluator/msbg/msbg.py
index 204144f0..5e6262d0 100644
--- a/clarity/evaluator/msbg/msbg.py
+++ b/clarity/evaluator/msbg/msbg.py
@@ -40,6 +40,7 @@ def __init__(
         sample_rate: float = 44100.0,
         equiv_0db_spl: float = 100.0,
         ahr: float = 20.0,
+        verbose: bool = False,
     ) -> None:
         """
         Constructor for the Ear class.
@@ -48,7 +49,9 @@
             sample_rate (float): sample frequency.
             equiv_0db_spl (): ???
             ahr (): ???
+            verbose (bool): print verbose output.
         """
+        self.verbose = verbose
         self.sample_rate = sample_rate
         self.src_correction = self.get_src_correction(src_pos)
         self.equiv_0db_spl = equiv_0db_spl
@@ -62,7 +65,7 @@ def set_audiogram(self, audiogram: Audiogram) -> None:
                 "Impairment too severe: Suggest you limit audiogram max to"
                 "80-90 dB HL, otherwise things go wrong/unrealistic."
             )
-        self.cochlea = Cochlea(audiogram=audiogram)
+        self.cochlea = Cochlea(audiogram=audiogram, verbose=self.verbose)
 
     @staticmethod
     def get_src_correction(src_pos: str) -> ndarray:
@@ -92,6 +95,7 @@ def src_to_cochlea_filt(
         src_correction: ndarray,
         sample_rate: float,
         backward: bool = False,
+        verbose: bool = True,
     ) -> ndarray:
         """Simulate middle and outer ear transfer functions.
 
@@ -109,12 +113,14 @@
                 or ITU
             sample_rate (int): sampling frequency
             backward (bool, optional): if true then cochlea to src (default: False)
+            verbose (bool, optional): print verbose output (default: True)
 
         Returns:
             np.ndarray: the processed signal
 
         """
-        logging.info("performing outer/middle ear corrections")
+        if verbose:
+            logging.info("performing outer/middle ear corrections")
 
         # make sure that response goes only up to sample_frequency/2
         nyquist = int(sample_rate / 2.0)
@@ -204,7 +210,8 @@ def process(self, signal: ndarray, add_calibration: bool = False) -> list[ndarra
             )
             raise ValueError("Invalid sampling frequency, valid value is 44100")
 
-        logging.info("Processing {len(chans)} samples")
+        if self.verbose:
+            logging.info("Processing {len(chans)} samples")
 
         # Need to know file RMS, and then call that a certain level in SPL:
         # needs some form of pre-measuring.
@@ -219,7 +226,7 @@ def process(self, signal: ndarray, add_calibration: bool = False) -> list[ndarra
 
         # Measure RMS where 3rd arg is dB_rel_rms (how far below)
         calculated_rms, idx, _rel_db_thresh, _active = measure_rms(
-            signal[0], sample_rate, -12
+            signal[0], sample_rate, -12, verbose=self.verbose
         )
 
         # Rescale input data and check level after rescaling
@@ -229,11 +236,11 @@ def process(self, signal: ndarray, add_calibration: bool = False) -> list[ndarra
         new_rms_db = equiv_0db_spl + 10 * np.log10(
             np.mean(np.power(signal[0][idx], 2.0))
         )
-        logging.info(
-            "Rescaling: "
-            f"leveldBSPL was {level_db_spl:3.1f} dB SPL, now {new_rms_db:3.1f} dB SPL. "
-            f" Target SPL is {target_spl:3.1f} dB SPL."
-        )
+        if self.verbose:
+            logging.info(
+                f"Rescaling: leveldBSPL was {level_db_spl:3.1f} dB SPL, now"
+                f" {new_rms_db:3.1f} dB SPL. Target SPL is {target_spl:3.1f} dB SPL."
+            )
 
         # Add calibration signal at target SPL dB
         if add_calibration is True:
@@ -247,11 +254,17 @@ def process(self, signal: ndarray, add_calibration: bool = False) -> list[ndarra
             signal = np.concatenate((pre_calibration, signal, post_calibration), axis=1)
 
         # Transform from src pos to cochlea, simulate cochlea, transform back to src pos
-        signal = Ear.src_to_cochlea_filt(signal, self.src_correction, sample_rate)
+        signal = Ear.src_to_cochlea_filt(
+            signal, self.src_correction, sample_rate, verbose=self.verbose
+        )
         if self.cochlea is not None:
             signal = np.array([self.cochlea.simulate(x, equiv_0db_spl) for x in signal])
-        signal = Ear.src_to_cochlea_filt(
-            signal, self.src_correction, sample_rate, backward=True
-        )
+        signal = Ear.src_to_cochlea_filt(
+            signal,
+            self.src_correction,
+            sample_rate,
+            backward=True,
+            verbose=self.verbose,
+        )
 
         # Implement low-pass filter at top end of audio range: flat to Cutoff freq,
diff --git a/clarity/evaluator/msbg/msbg_utils.py b/clarity/evaluator/msbg/msbg_utils.py
index 09704d79..f5c387b3 100644
--- a/clarity/evaluator/msbg/msbg_utils.py
+++ b/clarity/evaluator/msbg/msbg_utils.py
@@ -358,6 +358,7 @@ def generate_key_percent(
     threshold_db: float,
     window_length: int,
     percent_to_track: float | None = None,
+    verbose: bool = False,
 ) -> tuple[ndarray, float]:
     """Generate key percent.
     Locates frames above some energy threshold or tracks a certain percentage
@@ -370,6 +371,7 @@ def generate_key_percent(
         window_length (int): length of window in samples.
         percent_to_track (float, optional): Track a percentage of frames.
             Default is None
+        verbose (bool, optional): Print verbose output. Default is False.
 
     Raises:
         ValueError: percent_to_track is set too high.
@@ -393,10 +395,11 @@ def generate_key_percent(
     expected = threshold_db
 
     # new Dec 2003. Possibly track percentage of frames rather than fixed threshold
-    if percent_to_track is not None:
-        logging.info("tracking %s percentage of frames", percent_to_track)
-    else:
-        logging.info("tracking fixed threshold")
+    if verbose:
+        if percent_to_track is not None:
+            logging.info("tracking %s percentage of frames", percent_to_track)
+        else:
+            logging.info("tracking fixed threshold")
 
     # put floor into histogram distribution
     non_zero = np.power(10, (expected - 30) / 10)
@@ -466,6 +469,7 @@ def measure_rms(
     sample_rate: float,
     db_rel_rms: float,
     percent_to_track: float | None = None,
+    verbose=False,
 ) -> tuple[float, ndarray, float, float]:
     """Measure Root Mean Square.
 
@@ -481,6 +485,7 @@ def measure_rms(
         db_rel_rms (float): threshold for frames to track.
         percent_to_track (float, optional): track percentage of frames,
             rather than threshold (default: {None})
+        verbose (bool, optional): Print verbose output. Default is False.
 
     Returns:
         (tuple): tuple containing
        - rms (float): overall calculated rms (linear)
        - key (ndarray): "key" array of indices of selected frames
        - rel_dB_thresh (float): fixed threshold value of rms
        - active (float): proportion of values used in rms calculation
 
    """
@@ -500,6 +505,7 @@ def measure_rms(
         key_thr_db,
         round(WIN_SECS * sample_rate),
         percent_to_track=percent_to_track,
+        verbose=verbose,
     )
     idx = key.astype(int)  # move into generate_key_percent
 

From 30c14b866e49178f8043b1fd3a8693b62f38d410 Mon Sep 17 00:00:00 2001
From: Gerardo Roa
Date: Wed, 9 Oct 2024 12:42:23 +0100
Subject: [PATCH 03/10] add lyrics normalisation

Signed-off-by: Gerardo Roa
---
 recipes/cad2/task1/baseline/evaluate.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/recipes/cad2/task1/baseline/evaluate.py b/recipes/cad2/task1/baseline/evaluate.py
index 0112d09b..5ae81977 100644
--- a/recipes/cad2/task1/baseline/evaluate.py
+++ b/recipes/cad2/task1/baseline/evaluate.py
@@ -11,7 +11,7 @@
 import pyloudnorm as pyln
 import torch.nn
 import whisper
-from alt_eval import compute_metrics
+from alt_eval import compute_metrics, normalize_lyrics
 from omegaconf import DictConfig
 
 from clarity.enhancer.multiband_compressor import MultibandCompressor
@@ -93,6 +93,7 @@ def compute_intelligibility(
     )
 
     reference = segment_metadata["text"]
+    reference = normalize_lyrics(reference)
     lyrics["reference"] = reference
 
     # Compute left ear
@@ -106,10 +107,11 @@ def compute_intelligibility(
         sample_rate,
     )
     hypothesis = scorer.transcribe(left_path.as_posix(), fp16=False)["text"]
+    hypothesis = normalize_lyrics(hypothesis)
     lyrics["hypothesis_left"] = hypothesis
 
     left_results = compute_metrics(
-        reference, hypothesis, languages="en", include_other=False
+        [reference], [hypothesis], languages="en", include_other=False
     )
 
     # Compute right ear
@@ -123,10 +125,11 @@ def compute_intelligibility(
         sample_rate,
     )
     hypothesis = scorer.transcribe(right_path.as_posix(), fp16=False)["text"]
+    hypothesis = normalize_lyrics(hypothesis)
     lyrics["hypothesis_right"] = hypothesis
 
     right_results = compute_metrics(
-        reference, hypothesis, languages="en", include_other=False
+        [reference], [hypothesis], languages="en", include_other=False
     )

From 4c698677ada89c16924f0af0f05a4c59b2c1a1a8 Mon Sep 17 00:00:00 2001
From: Gerardo Roa
Date: Wed, 9 Oct 2024 12:43:32 +0100
Subject: [PATCH 04/10] add verbose false to msbg in CAD2

Signed-off-by: Gerardo Roa
---
 recipes/cad2/task1/baseline/evaluate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipes/cad2/task1/baseline/evaluate.py b/recipes/cad2/task1/baseline/evaluate.py
index 5ae81977..62b408f3 100644
--- a/recipes/cad2/task1/baseline/evaluate.py
+++ b/recipes/cad2/task1/baseline/evaluate.py
@@ -90,6 +90,7 @@ def compute_intelligibility(
     ear = Ear(
         equiv_0db_spl=equiv_0db_spl,
         sample_rate=sample_rate,
+        verbose=False,
     )
 
     reference = segment_metadata["text"]

From 58f37aae3cf21ff4520f46921938f9cabfa4ef92 Mon Sep 17 00:00:00 2001
From: Gerardo Roa
Date: Fri, 11 Oct 2024 16:30:01 +0100
Subject: [PATCH 05/10] remove the alt-eval normalisation as it is redundant

Signed-off-by: Gerardo Roa
---
 recipes/cad2/task1/baseline/evaluate.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/recipes/cad2/task1/baseline/evaluate.py b/recipes/cad2/task1/baseline/evaluate.py
index 62b408f3..dea69bfc 100644
--- a/recipes/cad2/task1/baseline/evaluate.py
+++ b/recipes/cad2/task1/baseline/evaluate.py
@@ -11,7 +11,7 @@
 import pyloudnorm as pyln
 import torch.nn
 import whisper
-from alt_eval import compute_metrics, normalize_lyrics
+from alt_eval import compute_metrics
 from omegaconf import DictConfig
 
 from clarity.enhancer.multiband_compressor import MultibandCompressor
@@ -94,7 +94,6 @@ def compute_intelligibility(
     )
 
     reference = segment_metadata["text"]
-    reference = normalize_lyrics(reference)
     lyrics["reference"] = reference
 
     # Compute left ear
@@ -108,7 +107,6 @@ def compute_intelligibility(
         sample_rate,
     )
     hypothesis = scorer.transcribe(left_path.as_posix(), fp16=False)["text"]
-    hypothesis = normalize_lyrics(hypothesis)
     lyrics["hypothesis_left"] = hypothesis
 
     left_results = compute_metrics(
@@ -126,7 +124,6 @@ def compute_intelligibility(
         sample_rate,
     )
     hypothesis = scorer.transcribe(right_path.as_posix(), fp16=False)["text"]
-    hypothesis = normalize_lyrics(hypothesis)
     lyrics["hypothesis_right"] = hypothesis
 
     right_results = compute_metrics(

From 3325779094b6154d34f1809f64bccbdb07ddefcc Mon Sep 17 00:00:00 2001
From: Gerardo Roa
Date: Tue, 15 Oct 2024 14:49:29 +0100
Subject: [PATCH 06/10] verbose True by default

Signed-off-by: Gerardo Roa
---
 clarity/evaluator/msbg/msbg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clarity/evaluator/msbg/msbg.py b/clarity/evaluator/msbg/msbg.py
index 5e6262d0..fabd943f 100644
--- a/clarity/evaluator/msbg/msbg.py
+++ b/clarity/evaluator/msbg/msbg.py
@@ -40,7 +40,7 @@ def __init__(
         sample_rate: float = 44100.0,
         equiv_0db_spl: float = 100.0,
         ahr: float = 20.0,
-        verbose: bool = False,
+        verbose: bool = True,
     ) -> None:
         """
         Constructor for the Ear class.

From 86a91e87dbff1484ec4784582615e0b1a5e70a54 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 15 Oct 2024 13:57:53 +0000
Subject: [PATCH 07/10] [pre-commit.ci] Fixing issues with pre-commit

---
 clarity/evaluator/msbg/cochlea.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clarity/evaluator/msbg/cochlea.py b/clarity/evaluator/msbg/cochlea.py
index 63c74808..10ff5cf6 100644
--- a/clarity/evaluator/msbg/cochlea.py
+++ b/clarity/evaluator/msbg/cochlea.py
@@ -15,7 +15,6 @@
 from clarity.evaluator.msbg.smearing import Smearer
 from clarity.utils.audiogram import Audiogram
 
-
 # TODO: Fix power overflow error when (expansion_ratios[ixch] - 1) < 0

From 3462e62ff240e9b63b805918daff8f7dea56c2f7 Mon Sep 17 00:00:00 2001
From: Gerardo Roa
Date: Tue, 15 Oct 2024 14:59:57 +0100
Subject: [PATCH 08/10] add alt-eval to requirements

Signed-off-by: Gerardo Roa
---
 recipes/cad2/task1/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/cad2/task1/requirements.txt b/recipes/cad2/task1/requirements.txt
index 12ce2cb5..9c6fb928 100644
--- a/recipes/cad2/task1/requirements.txt
+++ b/recipes/cad2/task1/requirements.txt
@@ -1,4 +1,4 @@
 huggingface-hub
-jiwer
+alt-eval
 openai-whisper
 safetensors

From 7076acf7f49fabfe392be10a72423e15f9dd8e09 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 15 Oct 2024 14:01:07 +0000
Subject: [PATCH 09/10] [pre-commit.ci] Fixing issues with pre-commit

---
 recipes/cad2/task1/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/cad2/task1/requirements.txt b/recipes/cad2/task1/requirements.txt
index 9c6fb928..14359e1f 100644
--- a/recipes/cad2/task1/requirements.txt
+++ b/recipes/cad2/task1/requirements.txt
@@ -1,4 +1,4 @@
-huggingface-hub
 alt-eval
+huggingface-hub
 openai-whisper
 safetensors

From 22289f77d2faec775ed487f31ec1370900e60e06 Mon Sep 17 00:00:00 2001
From: Gerardo Roa
Date: Fri, 18 Oct 2024 15:52:28 +0100
Subject: [PATCH 10/10] Update the task 1 evaluate script

Signed-off-by: Gerardo Roa
---
 recipes/cad2/task1/baseline/evaluate.py | 73 ++++++++++++++++++++----
 1 file changed, 60 insertions(+), 13 deletions(-)

diff --git a/recipes/cad2/task1/baseline/evaluate.py b/recipes/cad2/task1/baseline/evaluate.py
index dea69bfc..3749e181 100644
--- a/recipes/cad2/task1/baseline/evaluate.py
+++ b/recipes/cad2/task1/baseline/evaluate.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import hashlib
 import json
 import logging
 from pathlib import Path
@@ -25,6 +26,17 @@
 logger = logging.getLogger(__name__)
 
 
+def set_song_seed(song: str) -> None:
+    """Set a seed that is unique for the given song"""
+    song_encoded = hashlib.md5(song.encode("utf-8")).hexdigest()
+    song_md5 = int(song_encoded, 16) % (10**8)
+    np.random.seed(song_md5)
+
+    torch.manual_seed(song_md5)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(song_md5)
+
+
 def make_scene_listener_list(scenes_listeners: dict, small_test: bool = False) -> list:
     """Make the list of scene-listener pairing to process
 
@@ -101,12 +113,14 @@ def compute_intelligibility(
     enhanced_left = ear.process(enhanced_signal[:, 0])[0]
     left_path = Path(f"{path_intermediate.as_posix()}_left.flac")
     save_flac_signal(
-        enhanced_signal,
+        enhanced_left,
         left_path,
         44100,
         sample_rate,
     )
-    hypothesis = scorer.transcribe(left_path.as_posix(), fp16=False)["text"]
+    hypothesis = scorer.transcribe(left_path.as_posix(), fp16=False, temperature=0.0)[
+        "text"
+    ]
     lyrics["hypothesis_left"] = hypothesis
 
     left_results = compute_metrics(
@@ -118,12 +132,14 @@ def compute_intelligibility(
     enhanced_right = ear.process(enhanced_signal[:, 1])[0]
     right_path = Path(f"{path_intermediate.as_posix()}_right.flac")
     save_flac_signal(
-        enhanced_signal,
+        enhanced_right,
         right_path,
         44100,
         sample_rate,
     )
-    hypothesis = scorer.transcribe(right_path.as_posix(), fp16=False)["text"]
+    hypothesis = scorer.transcribe(right_path.as_posix(), fp16=False, temperature=0.0)[
+        "text"
+    ]
     lyrics["hypothesis_right"] = hypothesis
 
     right_results = compute_metrics(
@@ -160,9 +176,23 @@ def compute_quality(
     reference_signal: np.ndarray,
     enhanced_signal: np.ndarray,
     listener: Listener,
-    config: DictConfig,
+    reference_sample_rate: int,
+    enhanced_sample_rate: int,
+    HAAQI_sample_rate: int,
 ) -> tuple[float, float]:
-    """Compute the HAAQI score for the left and right channels"""
+    """Compute the HAAQI score for the left and right channels
+
+    Args:
+        reference_signal: The reference signal
+        enhanced_signal: The enhanced signal
+        listener: The listener
+        reference_sample_rate: The sample rate of the reference signal
+        enhanced_sample_rate: The sample rate of the enhanced signal
+        HAAQI_sample_rate: The sample rate for the HAAQI computation
+
+    Returns:
+        The HAAQI score for the left and right channels
+    """
 
     scores = []
     for channel in range(2):
@@ -172,16 +202,16 @@ def compute_quality(
         s = compute_haaqi(
             processed_signal=resample(
                 enhanced_signal[:, channel],
-                config.remix_sample_rate,
-                config.HAAQI_sample_rate,
+                enhanced_sample_rate,
+                HAAQI_sample_rate,
             ),
             reference_signal=resample(
                 reference_signal[:, channel],
-                config.input_sample_rate,
-                config.HAAQI_sample_rate,
+                reference_sample_rate,
+                HAAQI_sample_rate,
             ),
-            processed_sample_rate=config.HAAQI_sample_rate,
-            reference_sample_rate=config.HAAQI_sample_rate,
+            processed_sample_rate=HAAQI_sample_rate,
+            reference_sample_rate=HAAQI_sample_rate,
             audiogram=audiogram,
             equalisation=2,
             level1=65 - 20 * np.log10(compute_rms(reference_signal[:, channel])),
@@ -309,6 +339,11 @@ def run_compute_scores(config: DictConfig) -> None:
         sample_rate=config.input_sample_rate,
     )
 
+    # Configure backend for determinism
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+
+    # Load the Whisper model
     intelligibility_scorer = whisper.load_model(config.evaluate.whisper_version)
 
     # Loop over the scene-listener pairs
@@ -322,6 +357,10 @@ def run_compute_scores(config: DictConfig) -> None:
 
         scene_id, listener_id = scene_listener_ids
 
+        # Set the random seed for the scene
+        if config.evaluate.set_random_seed:
+            set_song_seed(scene_id)
+
         # Load scene details
         scene = scenes[scene_id]
         listener = listener_dict[listener_id]
@@ -382,7 +421,15 @@ def run_compute_scores(config: DictConfig) -> None:
 
         # COMPUTE SCORES
         # Compute the HAAQI and Whisper scores
-        haaqi_scores = compute_quality(reference, enhanced_signal, listener, config)
+        haaqi_scores = compute_quality(
+            reference_signal=reference,
+            enhanced_signal=enhanced_signal,
+            listener=listener,
+            reference_sample_rate=config.input_sample_rate,
+            enhanced_sample_rate=config.remix_sample_rate,
+            HAAQI_sample_rate=config.HAAQI_sample_rate,
+        )
+
        whisper_left, whisper_right, lyrics_text = compute_intelligibility(
            enhanced_signal=enhanced_signal,
            segment_metadata=songs[scene["segment_id"]],
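Note: the short sketch below is illustrative and is not part of the patch series. It pulls together the two behavioural changes the series makes to the CAD2 task1 evaluation: jiwer's compute_measures(reference, hypothesis) on bare strings is replaced by alt-eval's compute_metrics, which takes a list of references and a list of hypotheses (PATCH 03 onwards), and each scene is scored after seeding numpy/torch from a hash of its id (PATCH 10). The "WER" result key, the example scene id, and the score_transcript helper are assumptions made for the demo, not taken from the patches.

import hashlib

import numpy as np
import torch
from alt_eval import compute_metrics


def set_song_seed(song: str) -> None:
    """Seed numpy and torch from the song/scene id, as PATCH 10 does."""
    song_md5 = int(hashlib.md5(song.encode("utf-8")).hexdigest(), 16) % (10**8)
    np.random.seed(song_md5)
    torch.manual_seed(song_md5)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(song_md5)


def score_transcript(reference: str, hypothesis: str) -> dict:
    """Score one ear's Whisper transcript against the reference lyrics.

    alt-eval expects one list item per segment, unlike jiwer's
    compute_measures, which took bare strings.
    """
    return compute_metrics(
        [reference], [hypothesis], languages="en", include_other=False
    )


if __name__ == "__main__":
    set_song_seed("S0001")  # hypothetical scene id
    results = score_transcript(
        "you are the sunshine of my life",
        "you are the sunshine of my life",
    )
    print(results["WER"])  # key name assumed from alt-eval's metrics dict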