From 1a6063a4e73ff70257dfdfa2e754da74dc76816a Mon Sep 17 00:00:00 2001
From: Xavier Lizarraga
Date: Fri, 30 Aug 2024 15:36:28 +0200
Subject: [PATCH] Add first implementation and unit tests of Audio2Midi algo

---
 src/algorithms/tonal/audio2midi.cpp         | 115 +++++++
 src/algorithms/tonal/audio2midi.h           | 102 +++++++
 test/src/unittests/tonal/test_audio2midi.py | 328 ++++++++++++++++++++
 3 files changed, 545 insertions(+)
 create mode 100644 src/algorithms/tonal/audio2midi.cpp
 create mode 100644 src/algorithms/tonal/audio2midi.h
 create mode 100644 test/src/unittests/tonal/test_audio2midi.py

diff --git a/src/algorithms/tonal/audio2midi.cpp b/src/algorithms/tonal/audio2midi.cpp
new file mode 100644
index 000000000..d2bbd9d2a
--- /dev/null
+++ b/src/algorithms/tonal/audio2midi.cpp
@@ -0,0 +1,115 @@
+#include "audio2midi.h"
+
+using namespace std;
+using namespace essentia;
+using namespace standard;
+
+const char *Audio2Midi::name = "Audio2Midi";
+const char *Audio2Midi::category = "Pitch";
+const char *Audio2Midi::description = DOC("Wrapper around Audio2Pitch and Pitch2Midi for real time application");
+
+// Reads the parameters, picks the analysis frame size from the sample rate and
+// configures the inner LowPass, FrameBuffer, Audio2Pitch and Pitch2Midi
+// algorithms. Throws EssentiaException when the frame size exceeds half the
+// sample rate.
+void Audio2Midi::configure()
+{
+  _sampleRate = parameter("sampleRate").toReal();
+  // _frameSize = parameter("frameSize").toInt();
+  _hopSize = parameter("hopSize").toInt();
+  // _pitchAlgorithm = parameter("pitchAlgorithm").toString();
+  // _loudnessAlgorithm = parameter("loudnessAlgorithm").toString();
+  _minFrequency = parameter("minFrequency").toReal();
+  _maxFrequency = parameter("maxFrequency").toReal();
+  _tuningFrequency = parameter("tuningFrequency").toInt();
+  _pitchConfidenceThreshold = parameter("pitchConfidenceThreshold").toReal();
+  _loudnessThreshold = parameter("loudnessThreshold").toReal();
+  _transposition = parameter("transpositionAmount").toInt();
+  _minOcurrenceRate = parameter("minOcurrenceRate").toReal();
+  _midiBufferDuration = parameter("midiBufferDuration").toReal();
+  _minNoteChangePeriod = parameter("minNoteChangePeriod").toReal();
+  _minOnsetCheckPeriod = parameter("minOnsetCheckPeriod").toReal();
+  _minOffsetCheckPeriod = parameter("minOffsetCheckPeriod").toReal();
+  _applyTimeCompensation = parameter("applyTimeCompensation").toBool();
+
+  // Frame size depends on the sample rate; every rate without a dedicated
+  // entry (including 44100 and 48000) uses _fixedFrameSize.
+  switch (static_cast<int>(_sampleRate)) {
+    case 16000:
+      _frameSize = 2048;
+      break;
+    case 24000:
+      _frameSize = 4096;
+      break;
+    default:
+      _frameSize = _fixedFrameSize;
+      break;
+  }
+
+  if (_frameSize > _sampleRate * 0.5)
+  {
+    throw EssentiaException("Audio2Midi: Frame size cannot be higher than Nyquist frequency");
+  }
+
+  _lowpass->configure(INHERIT("sampleRate"),
+                      "cutoffFrequency", 1000);
+  _framebuffer->configure("bufferSize", _frameSize);
+  _audio2pitch->configure(INHERIT("sampleRate"),
+                          "frameSize", _frameSize,
+                          "pitchAlgorithm", _pitchAlgorithm,
+                          "minFrequency", _minFrequency,
+                          "maxFrequency", _maxFrequency,
+                          INHERIT("pitchConfidenceThreshold"),
+                          INHERIT("loudnessThreshold"));
+
+  _pitch2midi->configure(INHERIT("sampleRate"),
+                         INHERIT("hopSize"),
+                         INHERIT("minOcurrenceRate"),
+                         INHERIT("applyTimeCompensation"),
+                         "minOnsetCheckPeriod", _minOnsetCheckPeriod,
+                         "minOffsetCheckPeriod", _minOffsetCheckPeriod,
+                         "minNoteChangePeriod", _minNoteChangePeriod,
+                         "midiBufferDuration", _midiBufferDuration,
+                         "minFrequency", _minFrequency,
+                         "tuningFrequency", _tuningFrequency,
+                         "transpositionAmount", _transposition);
+}
+
+// Runs one input frame through LowPass -> FrameBuffer -> Audio2Pitch ->
+// Pitch2Midi. The inner algorithms write straight into this algorithm's
+// output variables, so nothing needs to be copied afterwards.
+void Audio2Midi::compute()
+{
+  const std::vector<Real>& frame = _frame.get();
+  Real& pitch = _pitch.get();
+  Real& loudness = _loudness.get();
+  vector<string>& messageType = _messageType.get();
+  vector<Real>& midiNoteNumber = _midiNoteNumber.get();
+  vector<Real>& timeCompensation = _timeCompensation.get();
+
+  _lowpass->input("signal").set(frame);
+  _lowpass->output("signal").set(lpFrame);
+
+  _framebuffer->input("frame").set(lpFrame);
+  _framebuffer->output("frame").set(analysisFrame);
+
+  _audio2pitch->input("frame").set(analysisFrame);
+  _audio2pitch->output("pitch").set(pitch);
+  _audio2pitch->output("pitchConfidence").set(pitchConfidence);
+  _audio2pitch->output("loudness").set(loudness);
+  _audio2pitch->output("voiced").set(voiced);
+
+  _pitch2midi->input("pitch").set(pitch);
+  _pitch2midi->input("voiced").set(voiced);
+  _pitch2midi->output("midiNoteNumber").set(midiNoteNumber);
+  _pitch2midi->output("timeCompensation").set(timeCompensation);
+  _pitch2midi->output("messageType").set(messageType);
+
+  _lowpass->compute();
+  _framebuffer->compute();
+  _audio2pitch->compute();
+  _pitch2midi->compute();
+
+  // TODO: format note-on/note-off messages; an earlier draft sketched this
+  // with a _formatter helper that is not implemented yet.
+}
diff --git a/src/algorithms/tonal/audio2midi.h b/src/algorithms/tonal/audio2midi.h
new file mode 100644
index 000000000..14994cde1
--- /dev/null
+++ b/src/algorithms/tonal/audio2midi.h
@@ -0,0 +1,102 @@
+#ifndef ESSENTIA_AUDIO2MIDI_H
+#define ESSENTIA_AUDIO2MIDI_H
+
+#include "algorithmfactory.h"
+
+namespace essentia {
+namespace standard {
+
+// Wraps LowPass, FrameBuffer, Audio2Pitch and Pitch2Midi so that one call to
+// compute() turns an audio frame into pitch, loudness and MIDI note messages.
+class Audio2Midi : public Algorithm {
+ protected:
+  Input<std::vector<Real>> _frame;
+  Output<Real> _pitch;
+  Output<Real> _loudness;
+  Output<std::vector<std::string> > _messageType;
+  Output<std::vector<Real> > _midiNoteNumber;
+  Output<std::vector<Real> > _timeCompensation;
+
+  // Inner algorithms, created in the constructor and deleted in the destructor.
+  Algorithm* _lowpass;
+  Algorithm* _framebuffer;
+  Algorithm* _audio2pitch;
+  Algorithm* _pitch2midi;
+
+  Real _sampleRate;
+  int _frameSize;
+  int _fixedFrameSize = 8192;
+  int _hopSize;
+  std::string _pitchAlgorithm = "pitchyinfft";
+  std::string _loudnessAlgorithm = "rms";
+  Real _minFrequency;
+  Real _maxFrequency;
+  int _tuningFrequency;
+  Real _pitchConfidenceThreshold, _loudnessThreshold, _minOcurrenceRate;
+  Real _midiBufferDuration;
+  Real _minNoteChangePeriod;
+  Real _minOnsetCheckPeriod;
+  Real _minOffsetCheckPeriod;
+
+  bool _applyTimeCompensation;
+  int _transposition;
+
+  // Intermediate results passed between the inner algorithms in compute().
+  std::vector<Real> lpFrame, analysisFrame;
+  Real pitchConfidence;
+  int voiced;
+
+ public:
+  Audio2Midi() {
+    declareInput(_frame, "frame", "the input frame to analyse");
+    declareOutput(_pitch, "pitch", "pitch given in Hz");
+    declareOutput(_loudness, "loudness", "detected loudness in decibels");
+    declareOutput(_messageType, "messageType", "the output of MIDI message type, as string, {noteoff, noteon, noteoff-noteon}");
+    declareOutput(_midiNoteNumber, "midiNoteNumber", "the output of detected MIDI note number, as integer, in range [0,127]");
+    declareOutput(_timeCompensation, "timeCompensation", "time to be compensated in the messages");
+
+    _lowpass = AlgorithmFactory::create("LowPass");
+    _framebuffer = AlgorithmFactory::create("FrameBuffer");
+    _audio2pitch = AlgorithmFactory::create("Audio2Pitch");
+    _pitch2midi = AlgorithmFactory::create("Pitch2Midi");
+  }
+
+  ~Audio2Midi() {
+    delete _lowpass;
+    delete _framebuffer;
+    delete _audio2pitch;
+    delete _pitch2midi;
+  }
+
+  void declareParameters() {
+    // TODO: revise parameter description
+    declareParameter("sampleRate", "sample rate of incoming audio frames", "[8000,inf)", 44100);
+    declareParameter("hopSize", "equivalent to I/O buffer size", "[1,inf)", 32);
+    // declareParameter("pitchAlgorithm", "pitch algorithm to use", "{pyin,pyin_fft}", "pyin_fft");
+    // declareParameter("loudnessAlgorithm", "loudness algorithm to use", "{loudness,rms}", "rms");
+    declareParameter("minFrequency", "minimum frequency to detect in Hz", "[10,20000]", 60.0);
+    declareParameter("maxFrequency", "maximum frequency to detect in Hz", "[10,20000]", 2300.0);
+    declareParameter("tuningFrequency", "tuning frequency for semitone index calculation, corresponding to A3 [Hz]", "{432,440}", 440);
+    declareParameter("pitchConfidenceThreshold", "level of pitch confidence above which note ON/OFF start to be considered", "[0,1]", 0.25);
+    declareParameter("loudnessThreshold", "loudness level above/below which note ON/OFF start to be considered, in decibels", "[-inf,0]", -51.0);
+    declareParameter("transpositionAmount", "Apply transposition (in semitones) to the detected MIDI notes.", "(-69,50)", 0);
+    declareParameter("minOcurrenceRate", "rate of predominant pitch ocurrence in MidiPool buffer to consider note ON event", "[0,1]", 0.5);
+    declareParameter("midiBufferDuration", "duration in seconds of buffer used for voting in MidiPool algorithm", "[0.005,0.5]", 0.05);  // 50 ms
+    declareParameter("minNoteChangePeriod", "minimum time to wait until a note change is detected (testing only)", "(0,1]", 0.030);
+    declareParameter("minOnsetCheckPeriod", "minimum time to wait until an onset is detected (testing only)", "(0,1]", 0.075);
+    declareParameter("minOffsetCheckPeriod", "minimum time to wait until an offset is detected (testing only)", "(0,1]", 0.2);
+    declareParameter("applyTimeCompensation", "whether to apply time compensation correction to MIDI note detection", "{true,false}", true);
+  }
+
+  void configure();
+  void compute();
+
+  static const char* name;
+  static const char* category;
+  static const char* description;
+};
+
+} // namespace standard
+} // namespace essentia
+
+#endif
diff --git a/test/src/unittests/tonal/test_audio2midi.py b/test/src/unittests/tonal/test_audio2midi.py
new file mode 100644
index 000000000..3fc9d3c8a
--- /dev/null
+++ b/test/src/unittests/tonal/test_audio2midi.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2006-2024 Music Technology Group - Universitat Pompeu Fabra
+#
+# This file is part of Essentia
+#
+# Essentia is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation (FSF), either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the Affero GNU General Public License
+# version 3 along with this program. If not, see http://www.gnu.org/licenses/
+
+
+from essentia_test import *
+from numpy import mean, array, float32, square
+from pathlib import Path
+
+
+class TestAudio2Midi(TestCase):
+    def testEmpty(self):
+        self.assertComputeFails(Audio2Midi(), [])
+
+    def testZero(self):
+        pitch, loudness, message_type, midi_note, time_compensation = Audio2Midi()(
+            zeros(32)
+        )
+        self.assertEqual(pitch, 0)
+        self.assertEqual(loudness, 0.0)
+        self.assertEqual(message_type, [])
+        self.assertEqual(midi_note.tolist(), array([0.0, 0.0], dtype=float32).tolist())
+        self.assertEqual(
+            time_compensation.tolist(), array([0.0, 0.0], dtype=float32).tolist()
+        )
+
+    def assessNoteList(
+        self,
+        reference_path: str,
+        estimated: list,
+        n_notes_tolerance: int = 0,
+        onset_tolerance: float = 0.01,
+        offset_tolerance: float = 0.01,
+        midi_note_tolerance: int = 0,
+    ):
+        """Compare estimated notes against a manually annotated reference.
+
+        Rows of the reference .npy file are read as [note index, onset,
+        offset, MIDI note]; ``estimated`` holds [onset, offset, MIDI note]
+        entries. The note count and the mean squared onset, offset and MIDI
+        note errors are asserted against the given tolerances.
+        """
+        # read the expected notes file manually annotated
+        expected_notes = numpy.load(join(filedir(), reference_path))
+        print("Expected notes:")
+        print(expected_notes)
+
+        # check the number of notes first: the per-note errors below index
+        # `estimated` with the note index stored in the reference
+        n_detected_notes = len(estimated)
+        n_expected_notes = len(expected_notes)
+        self.assertAlmostEqual(n_detected_notes, n_expected_notes, n_notes_tolerance)
+
+        # the element-wise difference only exists when both lists have the
+        # same number of notes
+        if n_detected_notes == n_expected_notes:
+            print("\ndiffs")
+            print(array(estimated) - expected_notes[:, 1:])
+
+        # estimate the onset error for each note and estimate the mean
+        onset_mse = mean(
+            [square(note[1] - estimated[int(note[0])][0]) for note in expected_notes]
+        )
+
+        # estimate the offset error for each note and estimate the mean
+        offset_mse = mean(
+            [square(note[2] - estimated[int(note[0])][1]) for note in expected_notes]
+        )
+
+        # estimate the midi note error for each note and estimate the mean
+        midi_note_mse = mean(
+            [square(note[-1] - estimated[int(note[0])][-1]) for note in expected_notes]
+        )
+
+        # assert outputs
+        self.assertAlmostEqual(onset_mse, 0, onset_tolerance)
+        self.assertAlmostEqual(offset_mse, 0, offset_tolerance)
+        self.assertAlmostEqual(midi_note_mse, 0, midi_note_tolerance)
+
+    def testARealCaseWithEMajorScale(self):
+        frame_size = 8192
+        sample_rate = 48000
+        hop_size = 64
+        loudness_threshold = -40
+        pitch_confidence_threshold = 0.25
+        min_frequency = 103.83
+        max_frequency = 659.26
+        midi_buffer_duration = 0.05
+        min_note_change_period = 0.03
+        n_notes_tolerance = 0
+        onset_tolerance = 0.01
+        midi_note_tolerance = 0
+
+        stem = "359500__mtg__sax-tenor-e-major"
+        audio_path = Path("recorded") / f"{stem}.wav"
+        reference_path = Path("pitch2midi") / f"{stem}.npy"
+
+        self.runARealCase(
+            audio_path=audio_path,
+            reference_path=reference_path,
+            sample_rate=sample_rate,
+            frame_size=frame_size,
+            hop_size=hop_size,
+            pitch_confidence_threshold=pitch_confidence_threshold,
+            loudness_threshold=loudness_threshold,
+            midi_buffer_duration=midi_buffer_duration,
+            min_note_change_period=min_note_change_period,
+            max_frequency=max_frequency,
+            min_frequency=min_frequency,
+            n_notes_tolerance=n_notes_tolerance,
+            onset_tolerance=onset_tolerance,
+            midi_note_tolerance=midi_note_tolerance,
+        )
+
+    def testARealCaseWithDMinorScale(self):
+        frame_size = 8192
+        sample_rate = 48000
+        hop_size = 64
+        loudness_threshold = -40
+        pitch_confidence_threshold = 0.25
+        min_frequency = 103.83
+        max_frequency = 659.26
+        midi_buffer_duration = 0.05
+        min_note_change_period = 0.03
+        n_notes_tolerance = 0
+        onset_tolerance = 0.01
+        midi_note_tolerance = 0
+
+        stem = "359628__mtg__sax-tenor-d-minor"
+        audio_path = Path("recorded") / f"{stem}.wav"
+        reference_path = Path("pitch2midi") / f"{stem}.npy"
+
+        self.runARealCase(
+            audio_path=audio_path,
+            reference_path=reference_path,
+            sample_rate=sample_rate,
+            frame_size=frame_size,
+            hop_size=hop_size,
+            pitch_confidence_threshold=pitch_confidence_threshold,
+            loudness_threshold=loudness_threshold,
+            midi_buffer_duration=midi_buffer_duration,
+            min_note_change_period=min_note_change_period,
+            max_frequency=max_frequency,
+            min_frequency=min_frequency,
+            n_notes_tolerance=n_notes_tolerance,
+            onset_tolerance=onset_tolerance,
+            midi_note_tolerance=midi_note_tolerance,
+        )
+
+    def testSeparatedNotes(self):
+        frame_size = 8192
+        sample_rate = 44100
+        hop_size = 32
+        loudness_threshold = -42
+        pitch_confidence_threshold = 0.6
+        min_frequency = 103.83
+        max_frequency = 659.26
+        midi_buffer_duration = 0.05
+        min_note_change_period = 0.03
+        min_offset_period = 0.1
+        n_notes_tolerance = 0
+        onset_tolerance = 0.01
+        midi_note_tolerance = 0
+
+        stem = "387517__deleted_user_7267864__saxophone-going-up"
+        audio_path = Path("recorded") / f"{stem}.wav"
+        reference_path = Path("pitch2midi") / f"{stem}.npy"
+
+        self.runARealCase(
+            audio_path=audio_path,
+            reference_path=reference_path,
+            sample_rate=sample_rate,
+            frame_size=frame_size,
+            hop_size=hop_size,
+            pitch_confidence_threshold=pitch_confidence_threshold,
+            loudness_threshold=loudness_threshold,
+            max_frequency=max_frequency,
+            min_frequency=min_frequency,
+            midi_buffer_duration=midi_buffer_duration,
+            min_note_change_period=min_note_change_period,
+            min_offset_period=min_offset_period,
+            n_notes_tolerance=n_notes_tolerance,
+            onset_tolerance=onset_tolerance,
+            midi_note_tolerance=midi_note_tolerance,
+        )
+
+    def runARealCase(
+        self,
+        audio_path: str,
+        reference_path: str,
+        sample_rate: int,
+        frame_size: int,
+        hop_size: int,
+        pitch_confidence_threshold: float,
+        loudness_threshold: float,
+        max_frequency: float,
+        min_frequency: float,
+        midi_buffer_duration: float,
+        min_note_change_period: float,
+        min_offset_period: float = 0.2,
+        n_notes_tolerance: int = 0,
+        onset_tolerance: float = 0.01,
+        offset_tolerance: float = 0.05,
+        midi_note_tolerance: int = 0,
+    ):
+        """Run Audio2Midi frame by frame over an audio file and check the notes.
+
+        The audio is cut into frames with FrameGenerator, every frame is fed to
+        Audio2Midi, each non-empty message is stored with its compensated
+        timestamps, and the resulting note list is checked against the
+        reference annotation with assessNoteList().
+        """
+        filename = join(testdata.audio_dir, audio_path)
+        if sys.platform == "darwin":
+            import soundfile as sf
+
+            audio, _ = sf.read(filename, dtype="float32")
+            if audio.ndim > 1:
+                audio = audio[:, 0]
+        else:
+            audio = MonoLoader(filename=filename, sampleRate=sample_rate)()
+        frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size)
+        step_time = hop_size / sample_rate
+
+        # initialize audio2midi instance
+        a2m = Audio2Midi(
+            sampleRate=sample_rate,
+            hopSize=hop_size,
+            midiBufferDuration=midi_buffer_duration,
+            minNoteChangePeriod=min_note_change_period,
+            minOffsetCheckPeriod=min_offset_period,
+            pitchConfidenceThreshold=pitch_confidence_threshold,
+            loudnessThreshold=loudness_threshold,
+            maxFrequency=max_frequency,
+            minFrequency=min_frequency,
+        )
+        print(a2m.parameterNames())
+
+        # define estimate bin and some counters
+        nte_list = []  # note toggle event list
+        n = 0
+        time_stamp = 0
+        n_notes = 0
+
+        # simulates real-time process
+        for frame in frames:
+            _pitch, _, message, midi_note, time_compensation = a2m(frame)
+            time_stamp += step_time
+            if message:
+                nte_list.append(
+                    [
+                        n_notes,
+                        time_stamp - time_compensation[1],
+                        time_stamp - time_compensation[0],
+                        int(midi_note[1]),
+                        message,
+                    ]
+                )
+                print(
+                    f"[{n_notes}][{n}]:{(time_stamp-time_compensation[1]):.3f}, {midi2note(int(midi_note[1]))}({int(midi_note[1])})~{_pitch:.2f}Hz, {message}"
+                )
+                if "note_on" in message:
+                    n_notes += 1
+            n += 1
+
+        print(f"nte_list: {nte_list}")
+        # from the nte_list extracts the note list using note_off messages
+        note_list = self.ntes_to_notes(nte_list)
+        print(f"note_list: {note_list}")
+
+        self.assessNoteList(
+            reference_path,
+            note_list,
+            n_notes_tolerance=n_notes_tolerance,
+            onset_tolerance=onset_tolerance,
+            offset_tolerance=offset_tolerance,
+            midi_note_tolerance=midi_note_tolerance,
+        )
+
+    def ntes_to_notes(self, nte_list: list):
+        """Convert note toggle events into [onset, offset, MIDI note] entries.
+
+        Every event whose message contains "note_on" starts a note. Its offset
+        is the timestamp of the following event, or of the last event in the
+        list when nothing follows.
+        """
+        note_list = list()
+        for n, nte_message in enumerate(nte_list):
+            if "note_on" in nte_message[4]:
+                # extract time stamp
+                start_time = nte_message[1]
+
+                # in some cases the compensation might generate negative values
+                if start_time < 0:
+                    start_time = 0
+
+                # to get the note offset it is needed to get time stamps in the next message (note-off)
+                if n + 1 < len(nte_list):  # when a note off message is provided
+                    # define timestamp for offset
+                    end_time = nte_list[n + 1][1]
+                else:  # there is a non-closed note at the end
+                    # define timestamp for offset
+                    end_time = nte_list[-1][1]
+                note = int(nte_message[3])
+                # define annotation in a list
+                note_list.append([float(start_time), float(end_time), note])
+        return note_list
+
+
+suite = allTests(TestAudio2Midi)
+
+if __name__ == "__main__":
+    TextTestRunner(verbosity=2).run(suite)