diff --git a/test/src/unittests/tonal/test_pitch2midi.py b/test/src/unittests/tonal/test_pitch2midi.py
index e6f836ed9..3a3eaac2d 100644
--- a/test/src/unittests/tonal/test_pitch2midi.py
+++ b/test/src/unittests/tonal/test_pitch2midi.py
@@ -19,189 +19,191 @@ from essentia_test import *
-from numpy import sin, pi, mean, random, array, float32
+from numpy import sin, pi, mean, random, array, float32, square
 
 
 class TestPitch2Midi(TestCase):
-    # def testEmpty(self):
-    #     self.assertComputeFails(Pitch2Midi(), -1, 0)
-
-    # def testZero(self):
-    #     message_type, midi_note, time_compensation = Pitch2Midi()(0, 0)
-    #     self.assertEqual(message_type, [])
-    #     self.assertEqual(midi_note.tolist(), array([], dtype=float32).tolist())
-    #     self.assertEqual(time_compensation.tolist(), array([], dtype=float32).tolist())
-
-    # def testOnset(self):
-    #     sample_rate = 44100
-    #     hop_size = 128
-    #     onset_compensation = 0.075
-    #     pitch = 440.0
-    #     nblocks_for_onset = round(onset_compensation / (hop_size / sample_rate))
-    #     pitches = [pitch] * nblocks_for_onset
-    #     voicings = [1] * nblocks_for_onset
-    #     expected_message_type = ["note_on"]
-
-    #     self.runTest(sample_rate, hop_size, pitches, voicings, expected_message_type)
-
-    # def testUnvoicedFrame(self):
-    #     sample_rate = 44100
-    #     hop_size = 128
-    #     onset_compensation = 0.075
-    #     minNoteChangePeriod = 0.03
-    #     nblocks_for_onset = round(onset_compensation / (hop_size / sample_rate))
-    #     nblocks_for_offset = round(minNoteChangePeriod / (hop_size / sample_rate)) + 1
-    #     pitches = ([440.0] * nblocks_for_onset) + ([0] * nblocks_for_offset)
-    #     voicings = ([1] * nblocks_for_onset) + ([0] * nblocks_for_offset)
-    #     expected_message_type = ["note_off"]
-
-    #     self.runTest(sample_rate, hop_size, pitches, voicings, expected_message_type)
-
-    # def testOffset(self):
-    #     sample_rate = 44100
-    #     hop_size = 128
-    #     onset_compensation = 0.075
-    #     min_occurrence_rate = 0.015 / 2
-    #     nblocks_for_onset = round(onset_compensation / (hop_size / sample_rate))
-    #     nblocks_for_offset = round(min_occurrence_rate / (hop_size / sample_rate))
-    #     midi_notes = [69, 70]
-    #     pitches = [midi2hz(note) for note in midi_notes]
-    #     pitches = ([pitches[0]] * nblocks_for_onset) + (
-    #         [pitches[1]] * nblocks_for_offset
-    #     )
-    #     voicings = [1] * (nblocks_for_onset + nblocks_for_offset)
-    #     expected_message_type = ["note_off", "note_on"]
-
-    #     self.runTest(sample_rate, hop_size, pitches, voicings, expected_message_type)
-
-    # def testContinuousChromaticSequence(self):
-    #     sample_rate = 44100
-    #     hop_size = 128
-    #     onset_compensation = 0.075
-    #     minNoteChangePeriod = 0.03
-    #     midi_buffer_duration = 0.015
-    #     min_occurrence_rate = 0.5
-    #     min_occurrence_period = midi_buffer_duration * min_occurrence_rate
-    #     nblocks_for_onset = round(onset_compensation / (hop_size / sample_rate))
-    #     nblocks_for_offset = round(minNoteChangePeriod / (hop_size / sample_rate))
-    #     nblocks_for_transition = round(min_occurrence_period / (hop_size / sample_rate))
-    #     n_notes = 12
-    #     midi_notes = list(range(69, 69 + n_notes))
-    #     # print(midi_notes)
-    #     pitches = [midi2hz(note) for note in midi_notes]
-    #     pitch_list = list()
-    #     for pitch in pitches:
-    #         pitch_list += [pitch] * (nblocks_for_transition + nblocks_for_onset)
-    #     pitch_list += [pitch] * (nblocks_for_offset + 1)
-    #     voicings = [1] * n_notes * (nblocks_for_onset + nblocks_for_transition)
-    #     voicings += [0] * (nblocks_for_offset + 2)
-    #     # print(len(pitch_list), len(voicings))
-    #     expected_message_type = ["note_off"]
-    #     self.runTest(sample_rate, hop_size, pitch_list, voicings, expected_message_type)
-
-    # # TODO: write a test that loads some audio files with an specific note and process them with p2m
-
-    # def runTest(
-    #     self,
-    #     sample_rate: int,
-    #     hop_size: int,
-    #     pitches: list,
-    #     voicings: list,
-    #     expected_value: int,
-    # ):
-    #     p2m = Pitch2Midi(sampleRate=sample_rate, hopSize=hop_size)
-    #     (
-    #         midi_notes,
-    #         time_compensations,
-    #         message_types,
-    #     ) = ([] for i in range(3))
-
-    #     for n, (pitch, voiced) in enumerate(zip(pitches, voicings)):
-    #         message, midi_note, time_compensation = p2m(pitch, voiced)
-    #         print(n, message, midi_note, time_compensation)
-    #         message_types.append(message)
-    #         midi_notes += [midi_note]
-    #         time_compensations += [time_compensation]
-    #     self.assertEqual(message_types[-1], expected_value)
-
-    # def testARealCase(self):
-
-    #     frameSize = 8192
-    #     sample_rate = 44100
-    #     hopSize = 128
-    #     loudness_threshold = -50
-    #     pitch_confidence_threshold = 0.25
-    #     filename = join(testdata.audio_dir, "recorded", "sax-test_min.wav")
-    #     if sys.platform == "darwin":
-    #         import soundfile as sf
-
-    #         audio, _ = sf.read(filename, dtype="float32")
-    #         audio = audio[:, 0]
-    #     else:
-    #         audio = MonoLoader(filename=filename, sampleRate=44100)()
-    #     frames = FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize)
-
-    #     n_frames = (audio.shape[0] - (frameSize - hopSize)) / hopSize
-    #     print(f"n_frames: {n_frames}")
-    #     # pitchDetect = Audio2Pitch(
-    #     #     frameSize=frameSize,
-    #     #     sampleRate=sample_rate,
-    #     #     pitchConfidenceThreshold=0.15,
-    #     #     loudnessThreshold=loudness_threshold,
-    #     # )
-    #     win = Windowing(normalized=False, zeroPhase=False)
-    #     spec = Spectrum()
-    #     pitchDetect = PitchYinFFT(
-    #         frameSize=frameSize,
-    #         sampleRate=sample_rate,
-    #         weighting="custom",
-    #         tolerance=1.0,
-    #     )
-    #     rmsDetect = RMS()
-
-    #     n_outputs = len(pitchDetect.outputNames())
-    #     pitch, confidence = ([] for _ in range(n_outputs))
-    #     loudness = []
-    #     voiced = []
-    #     n = 0
-    #     for frame in frames:
-    #         # print(n, end="\r")
-    #         # print(frame.shape)
-    #         # out = pitchDetect(frame)
-    #         _spec = spec(win(frame))
-    #         f, conf = pitchDetect(_spec)
-    #         pitch += [f]
-    #         confidence += [conf]
-    #         loudness += [rmsDetect(frame)]
-    #         if (
-    #             amp2db(loudness[n]) > loudness_threshold
-    #             and confidence[n] > pitch_confidence_threshold
-    #         ):
-    #             voiced += [1]
-    #         else:
-    #             voiced += [0]
-    #         n += 1
-    #     # print(voiced)
-
-    #     # TODO: create an annotation file for assessing the note toggle events
-    #     self.runTest(sample_rate, hopSize, pitch, voiced, "note-off")
-    #     # expected_pitch = numpy.load(join(filedir(), "pitchyinfft/vignesh_pitch.npy"))
-    #     # expected_conf = numpy.load(
-    #     #     join(filedir(), "pitchyinfft/vignesh_confidence.npy")
-    #     # )
-    #     # expected_voiced = [1] * len(expected_pitch)
-    #     # self.assertAlmostEqualVector(pitch, expected_pitch, 1e-6)
-    #     # self.assertAlmostEqualVector(confidence, expected_conf, 5e-5)
-    #     # self.assertAlmostEqualVector(voiced, expected_voiced)
-
-    # TODO: create annotations for e-major example to be assessed
+    def testEmpty(self):
+        self.assertComputeFails(Pitch2Midi(), -1, 0)
+
+    def testZero(self):
+        message_type, midi_note, time_compensation = Pitch2Midi()(0, 0)
+        self.assertEqual(message_type, [])
+        self.assertEqual(midi_note.tolist(), array([], dtype=float32).tolist())
+        self.assertEqual(time_compensation.tolist(), array([], dtype=float32).tolist())
+
+    def testOnset(self):
+        sample_rate = 44100
+        hop_size = 128
+        onset_compensation = 0.075
+        pitch = 440.0
+        nblocks_for_onset = round(onset_compensation / (hop_size / sample_rate))
+        pitches = [pitch] * nblocks_for_onset
+        voicings = [1] * nblocks_for_onset
+        expected_message_type = ["note_on"]
+
+        self.runTest(sample_rate, hop_size, pitches, voicings, expected_message_type)
+
+    def testUnvoicedFrame(self):
+        sample_rate = 44100
+        hop_size = 128
+        onset_compensation = 0.075
+        minNoteChangePeriod = 0.03
+        nblocks_for_onset = round(onset_compensation / (hop_size / sample_rate))
+        nblocks_for_offset = round(minNoteChangePeriod / (hop_size / sample_rate)) + 1
+        pitches = ([440.0] * nblocks_for_onset) + ([0] * nblocks_for_offset)
+        voicings = ([1] * nblocks_for_onset) + ([0] * nblocks_for_offset)
+        expected_message_type = ["note_off"]
+
+        self.runTest(sample_rate, hop_size, pitches, voicings, expected_message_type)
+
+    def testOffset(self):
+        sample_rate = 44100
+        hop_size = 128
+        onset_compensation = 0.075
+        min_occurrence_rate = 0.015 / 2
+        nblocks_for_onset = round(onset_compensation / (hop_size / sample_rate))
+        nblocks_for_offset = round(min_occurrence_rate / (hop_size / sample_rate))
+        midi_notes = [69, 70]
+        pitches = [midi2hz(note) for note in midi_notes]
+        pitches = ([pitches[0]] * nblocks_for_onset) + (
+            [pitches[1]] * nblocks_for_offset
+        )
+        voicings = [1] * (nblocks_for_onset + nblocks_for_offset)
+        expected_message_type = ["note_off", "note_on"]
+
+        self.runTest(sample_rate, hop_size, pitches, voicings, expected_message_type)
+
+    def testContinuousChromaticSequence(self):
+        sample_rate = 44100
+        hop_size = 128
+        onset_compensation = 0.075
+        minNoteChangePeriod = 0.03
+        midi_buffer_duration = 0.015
+        min_occurrence_rate = 0.5
+        min_occurrence_period = midi_buffer_duration * min_occurrence_rate
+        nblocks_for_onset = round(onset_compensation / (hop_size / sample_rate))
+        nblocks_for_offset = round(minNoteChangePeriod / (hop_size / sample_rate))
+        nblocks_for_transition = round(min_occurrence_period / (hop_size / sample_rate))
+        n_notes = 12
+        midi_notes = list(range(69, 69 + n_notes))
+        # print(midi_notes)
+        pitches = [midi2hz(note) for note in midi_notes]
+        pitch_list = list()
+        for pitch in pitches:
+            pitch_list += [pitch] * (nblocks_for_transition + nblocks_for_onset)
+        pitch_list += [pitch] * (nblocks_for_offset + 1)
+        voicings = [1] * n_notes * (nblocks_for_onset + nblocks_for_transition)
+        voicings += [0] * (nblocks_for_offset + 2)
+        # print(len(pitch_list), len(voicings))
+        expected_message_type = ["note_off"]
+        self.runTest(sample_rate, hop_size, pitch_list, voicings, expected_message_type)
+
+    def runTest(
+        self,
+        sample_rate: int,
+        hop_size: int,
+        pitches: list,
+        voicings: list,
+        expected_value: list,
+        expected_idx: int = -1,
+    ):
+        p2m = Pitch2Midi(sampleRate=sample_rate, hopSize=hop_size)
+        (
+            midi_notes,
+            time_compensations,
+            message_types,
+        ) = ([] for _ in range(3))
+
+        for n, (pitch, voiced) in enumerate(zip(pitches, voicings)):
+            message, midi_note, time_compensation = p2m(pitch, voiced)
+            # print(n, message, midi_note, time_compensation)
+            message_types.append(message)
+            midi_notes += [midi_note]
+            time_compensations += [time_compensation]
+        self.assertEqual(message_types[expected_idx], expected_value)
+
+    def testARealCase(self):
+
+        frameSize = 8192
+        sample_rate = 44100
+        hopSize = 128
+        loudness_threshold = -50
+        pitch_confidence_threshold = 0.25
+
+        filename = join(testdata.audio_dir, "recorded", "sax-test_min.wav")
+        if sys.platform == "darwin":
+            import soundfile as sf
+
+            audio, _ = sf.read(filename, dtype="float32")
+            audio = audio[:, 0]
+        else:
+            audio = MonoLoader(filename=filename, sampleRate=44100)()
+        frames = FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize)
+
+        n_frames = (audio.shape[0] - (frameSize - hopSize)) / hopSize
+
+        # print(f"n_frames: {n_frames}")
+        # pitchDetect = Audio2Pitch(
+        #     frameSize=frameSize,
+        #     sampleRate=sample_rate,
+        #     pitchConfidenceThreshold=0.15,
+        #     loudnessThreshold=loudness_threshold,
+        # )
+
+        win = Windowing(normalized=False, zeroPhase=False)
+        spec = Spectrum()
+        pitchDetect = PitchYinFFT(
+            frameSize=frameSize,
+            sampleRate=sample_rate,
+            weighting="custom",
+            tolerance=1.0,
+        )
+        rmsDetect = RMS()
+
+        n_outputs = len(pitchDetect.outputNames())
+        pitch, confidence = ([] for _ in range(n_outputs))
+        loudness = []
+        voiced = []
+        n = 0
+        for frame in frames:
+            # print(n, end="\r")
+            # print(frame.shape)
+            # out = pitchDetect(frame)
+            _spec = spec(win(frame))
+            f, conf = pitchDetect(_spec)
+            pitch += [f]
+            confidence += [conf]
+            loudness += [rmsDetect(frame)]
+            if (
+                amp2db(loudness[n]) > loudness_threshold
+                and confidence[n] > pitch_confidence_threshold
+            ):
+                voiced += [1]
+            else:
+                voiced += [0]
+            n += 1
+
+        expected_value = ["note_off"]
+        expected_idx = 2869
+        # TODO: create an annotation file for assessing the note toggle events
+        self.runTest(sample_rate, hopSize, pitch, voiced, expected_value, expected_idx)
+        # expected_pitch = numpy.load(join(filedir(), "pitchyinfft/vignesh_pitch.npy"))
+        # expected_conf = numpy.load(
+        #     join(filedir(), "pitchyinfft/vignesh_confidence.npy")
+        # )
+        # expected_voiced = [1] * len(expected_pitch)
+        # self.assertAlmostEqualVector(pitch, expected_pitch, 1e-6)
+        # self.assertAlmostEqualVector(confidence, expected_conf, 5e-5)
+        # self.assertAlmostEqualVector(voiced, expected_voiced)
+
     def testARealCaseWithEMajorScale(self):
         frame_size = 8192
         sample_rate = 48000
-        hop_size = 128
-        loudness_threshold = -70
+        hop_size = 64
+        loudness_threshold = -40
         pitch_confidence_threshold = 0.25
-        transposition = 2
+
         filename = join(
             testdata.audio_dir, "recorded", "359500__mtg__sax-tenor-e-major.wav"
         )
@@ -218,7 +220,6 @@ def testARealCaseWithEMajorScale(self):
 
         n_frames = (audio.shape[0] - (frame_size - hop_size)) / hop_size
         step_time = hop_size / sample_rate
-        print(f"n_frames: {n_frames}\nstep_time: {step_time}[s]")
 
         # initialize audio2pitch & pitch2midi instances
         pitchDetect = Audio2Pitch(
@@ -233,55 +234,59 @@ def testARealCaseWithEMajorScale(self):
             hopSize=hop_size,
             midiBufferDuration=0.05,
         )
-        (
-            midi_notes,
-            time_compensations,
-            message_types,
-        ) = ([] for i in range(3))
-
-        # define some bins and process
-        n_outputs = len(pitchDetect.outputNames())
-        print(pitchDetect.outputNames())
-        print(f"n_outputs: {n_outputs}")
-        pitch, confidence, loudness, voiced = ([] for _ in range(n_outputs))
+        # container for the detected note events and some counters
+        estimated = []
         n = 0
         time_stamp = 0
         n_notes = 0
-        print(p2m.parameterNames())
 
         # simulates real-time process
         for frame in frames:
-            # print(n, end="\r")
-            # print(frame.shape)
            _pitch, _confidence, _loudness, _voiced = pitchDetect(frame)
            message, midi_note, time_compensation = p2m(_pitch, _voiced)
            time_stamp += step_time
-            # print(n, time_stamp, _pitch, _confidence, _loudness, _voiced)
            if _voiced:
                if message:
+                    estimated.append(
+                        [
+                            n_notes,
+                            time_stamp - time_compensation[1],
+                            time_stamp - time_compensation[0],
+                            int(midi_note[1]),
+                        ]
+                    )
+                    # print(
+                    #     f"[{n_notes}][{n}]:{(time_stamp-time_compensation[1]):.3f}, {midi2note(int(midi_note[1]))}({int(midi_note[1])}):{_pitch:.2f}, {time_compensation}, {midi_note}, {message}"
+                    # )
                    if "note_on" in message:
                        n_notes += 1
-                        print(
f"[{n_notes}][{n}]:{time_stamp-time_compensation[1]}, {midi2note(int(midi_note[1]))}({int(midi_note[1])}):{_pitch:.3f}, {time_compensation}, {midi_note}, {message}" - ) - # else: - # print( - # f"--[{n_notes}][{n}]:{time_stamp-time_compensation[1]}, {midi2note(int(midi_note[1]))}({int(midi_note[1])}):{_pitch:.3f}, {time_compensation}, {midi_note}" - # ) - # else: - # print(message, midi_note) - # pitch += [f] - # confidence += [conf] - # loudness += [rmsDetect(frame)] - # if ( - # amp2db(loudness[n]) > loudness_threshold - # and confidence[n] > pitch_confidence_threshold - # ): - # voiced += [1] - # else: - # voiced += [0] n += 1 + ## convert note toggle messages to note features + expected_notes = numpy.load( + join(filedir(), "pitch2midi/359500__mtg__sax-tenor-e-major.npy") + ) + + # estimate the number of notes for expected and detected + n_detected_notes = len(estimated) + n_expected_notes = len(expected_notes) + + # estimate the onset error for each note and estimate the mean + onset_mse = mean( + [square(note[1] - estimated[int(note[0])][1]) for note in expected_notes] + ) + + # estimate the midi note error for each note and estimate the mean + midi_note_mse = mean( + [square(note[-1] - estimated[int(note[0])][-1]) for note in expected_notes] + ) + + # assert outputs + self.assertAlmostEqual(onset_mse, 0, 0.01) + self.assertAlmostEqual(n_detected_notes, n_expected_notes, 0) + self.assertAlmostEqual(midi_note_mse, midi_note_mse, 0) + suite = allTests(TestPitch2Midi)