From 408466996576a3be98a0da5d803139bae269633b Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Wed, 17 May 2023 15:13:07 -0700 Subject: [PATCH 01/35] Cut branch r1.19.0 Signed-off-by: smajumdar --- Jenkinsfile | 330 +++++++++--------- README.rst | 60 ++-- nemo/package_info.py | 2 +- tutorials/00_NeMo_Primer.ipynb | 8 +- tutorials/01_NeMo_Models.ipynb | 2 +- tutorials/02_NeMo_Adapters.ipynb | 2 +- tutorials/AudioTranslationSample.ipynb | 4 +- ...blish_NeMo_Model_On_Hugging_Face_Hub.ipynb | 2 +- tutorials/VoiceSwapSample.ipynb | 4 +- .../asr/ASR_CTC_Language_Finetuning.ipynb | 4 +- .../ASR_Example_CommonVoice_Finetuning.ipynb | 10 +- tutorials/asr/ASR_for_telephony_speech.ipynb | 2 +- tutorials/asr/ASR_with_NeMo.ipynb | 4 +- .../asr/ASR_with_Subword_Tokenization.ipynb | 2 +- tutorials/asr/ASR_with_Transducers.ipynb | 2 +- .../asr/Buffered_Transducer_Inference.ipynb | 2 +- ..._Transducer_Inference_with_LCS_Merge.ipynb | 2 +- tutorials/asr/Intro_to_Transducers.ipynb | 4 +- tutorials/asr/Multilang_ASR.ipynb | 8 +- tutorials/asr/Offline_ASR.ipynb | 4 +- .../Offline_ASR_with_VAD_for_CTC_models.ipynb | 2 +- .../asr/Online_ASR_Microphone_Demo.ipynb | 2 +- tutorials/asr/Online_Noise_Augmentation.ipynb | 2 +- .../Online_Offline_Microphone_VAD_Demo.ipynb | 4 +- .../Online_Offline_Speech_Commands_Demo.ipynb | 4 +- .../asr/Self_Supervised_Pre_Training.ipynb | 10 +- tutorials/asr/Speech_Commands.ipynb | 2 +- tutorials/asr/Streaming_ASR.ipynb | 4 +- tutorials/asr/Voice_Activity_Detection.ipynb | 2 +- .../asr/asr_adapters/ASR_with_Adapters.ipynb | 4 +- ...netuning_at_Scale_with_AWS_SageMaker.ipynb | 12 +- .../cloud/aws/SageMaker_ASR_Training.ipynb | 6 +- ...Language_Models_for_Downstream_Tasks.ipynb | 12 +- tutorials/nlp/02_NLP_Tokenizers.ipynb | 4 +- ...a_Preprocessing_and_Cleaning_for_NMT.ipynb | 6 +- tutorials/nlp/Dialogue.ipynb | 2 +- tutorials/nlp/Entity_Linking_Medical.ipynb | 4 +- tutorials/nlp/GLUE_Benchmark.ipynb | 2 +- tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb | 12 +- ...Joint_Intent_and_Slot_Classification.ipynb | 2 +- tutorials/nlp/MegatronBert_export.ipynb | 4 +- ...on_Synthetic_Tabular_Data_Generation.ipynb | 2 +- .../nlp/Multitask_Prompt_and_PTuning.ipynb | 8 +- .../nlp/Punctuation_and_Capitalization.ipynb | 8 +- ...ion_and_Capitalization_Lexical_Audio.ipynb | 8 +- tutorials/nlp/Question_Answering.ipynb | 2 +- .../nlp/Relation_Extraction-BioMegatron.ipynb | 2 +- ...xt_Classification_Sentiment_Analysis.ipynb | 2 +- .../Token_Classification-BioMegatron.ipynb | 2 +- ...ssification_Named_Entity_Recognition.ipynb | 4 +- .../nlp/Zero_Shot_Intent_Recognition.ipynb | 4 +- .../ASR_with_SpeakerDiarization.ipynb | 6 +- .../Speaker_Diarization_Inference.ipynb | 12 +- .../Speaker_Diarization_Training.ipynb | 8 +- .../Speaker_Identification_Verification.ipynb | 8 +- .../tools/CTC_Segmentation_Tutorial.ipynb | 8 +- tutorials/tools/Multispeaker_Simulator.ipynb | 4 +- .../tts/Aligner_Inference_Examples.ipynb | 4 +- .../Evaluation_MelCepstralDistortion.ipynb | 6 +- .../tts/FastPitch_Adapter_Finetuning.ipynb | 4 +- .../tts/FastPitch_ChineseTTS_Training.ipynb | 8 +- tutorials/tts/FastPitch_Finetuning.ipynb | 4 +- .../tts/FastPitch_GermanTTS_Training.ipynb | 10 +- .../tts/FastPitch_MixerTTS_Training.ipynb | 2 +- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 4 +- .../tts/FastPitch_Speaker_Interpolation.ipynb | 2 +- .../tts/Inference_DurationPitchControl.ipynb | 4 +- tutorials/tts/Inference_ModelSelect.ipynb | 2 +- tutorials/tts/NeMo_TTS_Primer.ipynb | 2 +- .../tts/Pronunciation_customization.ipynb | 12 
+- tutorials/tts/Tacotron2_Training.ipynb | 2 +- tutorials/tts/Vits_Training.ipynb | 2 +- 72 files changed, 358 insertions(+), 358 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index f43b301af..27fbf1114 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -85,8 +85,8 @@ pipeline { stage('L0: Unit Tests CPU') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } steps { @@ -97,8 +97,8 @@ pipeline { stage('L2: ASR dev run') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -183,8 +183,8 @@ pipeline { stage('L2: ASR dev run - part two') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -213,8 +213,8 @@ pipeline { stage('L2: Speech to Text EMA') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } steps { @@ -234,8 +234,8 @@ pipeline { stage('L2: Speaker dev run') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -357,8 +357,8 @@ pipeline { // stage('L2: ASR DALI dev run') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -425,8 +425,8 @@ pipeline { // stage('L2: ASR RNNT dev run') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -487,8 +487,8 @@ pipeline { // stage('L2: Hybrid ASR RNNT-CTC dev run') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -517,8 +517,8 @@ pipeline { stage('L2: ASR Multi-dataloader dev run') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -565,8 +565,8 @@ pipeline { stage('L2: ASR Adapters') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -610,8 +610,8 @@ pipeline { stage('L2: Megatron T5 Adapter PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -656,8 +656,8 @@ pipeline { stage('L2: Megatron T5 Adapter TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -700,8 +700,8 @@ pipeline { stage('L2: Megatron T5 IA3 PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -746,8 +746,8 @@ pipeline { stage('L2: Megatron T5 IA3 TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -790,8 +790,8 @@ pipeline { stage('L2: Megatron GPT Adapter TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -833,8 +833,8 @@ pipeline { stage('L2: Megatron GPT Adapter PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -877,8 +877,8 @@ pipeline { 
stage('L2: Speech Transcription') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -898,8 +898,8 @@ pipeline { stage('L2: Transducer alignment') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -915,8 +915,8 @@ pipeline { stage('L2: Segmentation Tool') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } stages { @@ -971,8 +971,8 @@ pipeline { stage('L2: G2P Models') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1053,8 +1053,8 @@ pipeline { // stage('L2: Multi-GPU Megatron finetuning') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -1080,8 +1080,8 @@ pipeline { stage('L2: STS-b') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1140,8 +1140,8 @@ pipeline { stage('L2: Dialogue Classification') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1311,8 +1311,8 @@ pipeline { stage('L2: Dialogue Generation') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1377,8 +1377,8 @@ pipeline { // stage('L2: Dialogue Generation Part 2') { // when { // anyOf { -// branch 'main' -// changeRequest target: 'main' +// branch 'r1.19.0' +// changeRequest target: 'r1.19.0' // } // } // failFast true @@ -1407,8 +1407,8 @@ pipeline { stage('L2: COPY') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1437,8 +1437,8 @@ pipeline { stage('L2: Duplex Text Normalization') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1470,13 +1470,13 @@ pipeline { } } } - // Runs out of memory on the 12G TITAN V (GPU 0 on main CI) + // Runs out of memory on the 12G TITAN V (GPU 0 on r1.19.0 CI) // TODO: add when megatron bert is supported again in NeMo // stage('L2: MegaBERT Token Classification') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -1501,8 +1501,8 @@ pipeline { stage('L2: BERT Text Classification') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1530,8 +1530,8 @@ pipeline { stage('L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1589,8 +1589,8 @@ pipeline { stage('L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1650,8 +1650,8 @@ pipeline { stage('L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ 
-1711,8 +1711,8 @@ pipeline { stage('L2: Intent and Slot Classification Tasks') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1751,8 +1751,8 @@ pipeline { // stage('L2: Model Parallel Size 2 Megatron Text Classification') { // when { // anyOf{ - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -1780,8 +1780,8 @@ pipeline { // stage('L2: Model Parallel Size 2 Megatron Autoresume') { // when { // anyOf{ - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -1811,8 +1811,8 @@ pipeline { // stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { // when { // anyOf{ - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -1832,8 +1832,8 @@ pipeline { // stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { // when { // anyOf{ - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -1855,8 +1855,8 @@ pipeline { stage('L2: Parallel NLP Examples 2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -1980,8 +1980,8 @@ pipeline { stage('Punctuation & Capitalization tarred dataset') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2039,8 +2039,8 @@ pipeline { stage('Punctuation & Capitalization, Different ways of passing labels to model') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2147,8 +2147,8 @@ pipeline { stage('Punctuation & Capitalization inference') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2173,8 +2173,8 @@ pipeline { stage('L2: Parallel Pretraining BERT pretraining from Text/Preprocessed') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2235,8 +2235,8 @@ pipeline { stage('L2: Entity Linking') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2263,8 +2263,8 @@ pipeline { stage('L2: NMT Attention is All You Need Training') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2386,8 +2386,8 @@ pipeline { stage('L2: NMT Attention is All You Need Inference') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2422,8 +2422,8 @@ pipeline { stage('L2: NMT Attention is All You Need Finetuning') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2457,8 +2457,8 @@ pipeline { stage('L2: NMT Tarred Dataset Creation') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2511,8 +2511,8 @@ pipeline { stage('L2: Megatron NMT Training TP=2') { when { anyOf { - branch 
'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2610,8 +2610,8 @@ pipeline { // stage('L2: NMT Bottleneck Fallback') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -2657,8 +2657,8 @@ pipeline { // stage('L2: NMT Bottleneck Architecture') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -2740,8 +2740,8 @@ pipeline { // stage('L2: NMT Bottleneck LVM') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -2823,8 +2823,8 @@ pipeline { stage('L2: Megatron Bert Pretraining and Resume Training with Pipeline Paralleism') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2893,8 +2893,8 @@ pipeline { stage('L2: Megatron Bert Pretraining and Resume Training') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -2964,8 +2964,8 @@ pipeline { stage('L2: Megatron RETRO Pretraining and Resume Training') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3036,8 +3036,8 @@ pipeline { stage('L2: Megatron RETRO muTransfer Pretraining Performance') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3119,8 +3119,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: BioMegatron Bert NER Task') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3137,8 +3137,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3221,8 +3221,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Pretraining and Resume Training PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3305,8 +3305,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Finetuning PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3373,8 +3373,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Eval') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3390,8 +3390,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Eval PP2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3409,8 +3409,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Prompt Tuning TP1 PP1') { when { anyOf { - 
branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3444,8 +3444,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Prompt Tuning TP2 PP1') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3488,8 +3488,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' // stage('L2: Megatron GPT Prompt Tuning TP1 PP2') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -3533,8 +3533,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' // stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -3560,8 +3560,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron Change Partitions') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3607,8 +3607,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3703,8 +3703,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 with ALiBi Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3799,8 +3799,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 with KERPLE Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3895,8 +3895,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Pretraining and Resume Training PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -3965,8 +3965,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 w/ Mixture of Expert Pretraining') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4010,8 +4010,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Prompt Learning TP1 PP1') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4051,8 +4051,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Prompt Learning TP2 PP1') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4096,8 +4096,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' // stage('L2: Megatron T5 Prompt Learning TP1 PP2') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 
'r1.19.0' + // changeRequest target: 'r1.19.0' // } // } // failFast true @@ -4140,8 +4140,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron UL2 Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4220,8 +4220,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Eval') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4237,8 +4237,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron BART Pretraining and Resume Training, TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4306,8 +4306,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron BART Pretraining and Resume Training, PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4379,8 +4379,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 GLUE/XNLI Finetuning') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4452,8 +4452,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron Mock Data Generation') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true @@ -4489,8 +4489,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: TTS Fast dev runs 1') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } parallel { @@ -4635,8 +4635,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L??: Speech Checkpoints tests') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.19.0' + changeRequest target: 'r1.19.0' } } failFast true diff --git a/README.rst b/README.rst index 1335620ea..841509dfe 100644 --- a/README.rst +++ b/README.rst @@ -5,9 +5,9 @@ :target: http://www.repostatus.org/#active :alt: Project Status: Active – The project has reached a stable, usable state and is being actively developed. -.. |documentation| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=main +.. |documentation| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=r1.19.0 :alt: Documentation - :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/ + :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/ .. |license| image:: https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg :target: https://github.com/NVIDIA/NeMo/blob/master/LICENSE @@ -25,7 +25,7 @@ :target: https://pepy.tech/project/nemo-toolkit :alt: PyPi total downloads -.. |codeql| image:: https://github.com/nvidia/nemo/actions/workflows/codeql.yml/badge.svg?branch=main&event=push +.. 
|codeql| image:: https://github.com/nvidia/nemo/actions/workflows/codeql.yml/badge.svg?branch=r1.19.0&event=push :target: https://github.com/nvidia/nemo/actions/workflows/codeql.yml :alt: CodeQL @@ -33,7 +33,7 @@ :target: https://github.com/psf/black :alt: Code style: black -.. _main-readme: +.. _r1.19.0-readme: **NVIDIA NeMo** =============== @@ -61,7 +61,7 @@ We have extensive `tutorials `_. For advanced users that want to train NeMo models from scratch or finetune existing NeMo models -we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. +we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. For scaling NeMo LLM training on Slurm clusters or public clouds, please see the `NVIDIA NeMo Megatron Launcher `_. The NM launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and also has an `Autoconfigurator `_ @@ -74,7 +74,7 @@ Key Features * Speech processing * `HuggingFace Space for Audio Transcription (File, Microphone and YouTube) `_ - * `Automatic Speech Recognition (ASR) `_ + * `Automatic Speech Recognition (ASR) `_ * Supported ASR models: ``_ * Jasper, QuartzNet, CitriNet, ContextNet * Conformer-CTC, Conformer-Transducer, FastConformer-CTC, FastConformer-Transducer @@ -88,42 +88,42 @@ Key Features * Streaming/Buffered ASR (CTC/Transducer) - `Chunked Inference Examples `_ * Cache-aware Streaming Conformer - ``_ * Beam Search decoding - * `Language Modelling for ASR `_: N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer - * `Support of long audios for Conformer with memory efficient local attention `_ - * `Speech Classification, Speech Command Recognition and Language Identification `_: MatchboxNet (Command Recognition), AmberNet (LangID) + * `Language Modelling for ASR `_: N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer + * `Support of long audios for Conformer with memory efficient local attention `_ + * `Speech Classification, Speech Command Recognition and Language Identification `_: MatchboxNet (Command Recognition), AmberNet (LangID) * `Voice activity Detection (VAD) `_: MarbleNet * ASR with VAD Inference - `Example `_ - * `Speaker Recognition `_: TitaNet, ECAPA_TDNN, SpeakerNet - * `Speaker Diarization `_ + * `Speaker Recognition `_: TitaNet, ECAPA_TDNN, SpeakerNet + * `Speaker Diarization `_ * Clustering Diarizer: TitaNet, ECAPA_TDNN, SpeakerNet * Neural Diarizer: MSDD (Multi-scale Diarization Decoder) - * `Speech Intent Detection and Slot Filling `_: Conformer-Transformer + * `Speech Intent Detection and Slot Filling `_: Conformer-Transformer * `Pretrained models on different languages. `_: English, Spanish, German, Russian, Chinese, French, Italian, Polish, ... * `NGC collection of pre-trained speech processing models. 
`_ * Natural Language Processing * `NeMo Megatron pre-training of Large Language Models `_ - * `Neural Machine Translation (NMT) `_ - * `Punctuation and Capitalization `_ - * `Token classification (named entity recognition) `_ - * `Text classification `_ - * `Joint Intent and Slot Classification `_ - * `Question answering `_ - * `GLUE benchmark `_ - * `Information retrieval `_ - * `Entity Linking `_ - * `Dialogue State Tracking `_ - * `Prompt Learning `_ + * `Neural Machine Translation (NMT) `_ + * `Punctuation and Capitalization `_ + * `Token classification (named entity recognition) `_ + * `Text classification `_ + * `Joint Intent and Slot Classification `_ + * `Question answering `_ + * `GLUE benchmark `_ + * `Information retrieval `_ + * `Entity Linking `_ + * `Dialogue State Tracking `_ + * `Prompt Learning `_ * `NGC collection of pre-trained NLP models. `_ * `Synthetic Tabular Data Generation `_ -* `Speech synthesis (TTS) `_ +* `Speech synthesis (TTS) `_ * Spectrogram generation: Tacotron2, GlowTTS, TalkNet, FastPitch, FastSpeech2, Mixer-TTS, Mixer-TTS-X * Vocoders: WaveGlow, SqueezeWave, UniGlow, MelGAN, HiFiGAN, UnivNet * End-to-end speech generation: FastPitch_HifiGan_E2E, FastSpeech2_HifiGan_E2E, VITS * `NGC collection of pre-trained TTS models. `_ * `Tools `_ - * `Text Processing (text normalization and inverse text normalization) `_ - * `CTC-Segmentation tool `_ - * `Speech Data Explorer `_: a dash-based tool for interactive exploration of ASR/TTS datasets + * `Text Processing (text normalization and inverse text normalization) `_ + * `CTC-Segmentation tool `_ + * `Speech Data Explorer `_: a dash-based tool for interactive exploration of ASR/TTS datasets * `Speech Data Processor `_ @@ -139,10 +139,10 @@ Requirements Documentation ------------- -.. |main| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=main +.. |r1.19.0| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=r1.19.0 :alt: Documentation Status :scale: 100% - :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/ + :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/ .. |stable| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=stable :alt: Documentation Status @@ -152,7 +152,7 @@ Documentation +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ | Version | Status | Description | +=========+=============+==========================================================================================================================================+ -| Latest | |main| | `Documentation of the latest (i.e. main) branch. `_ | +| Latest | |r1.19.0| | `Documentation of the latest (i.e. main) branch. `_ | +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ | Stable | |stable| | `Documentation of the stable (i.e. most recent release) branch. 
`_ | +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/nemo/package_info.py b/nemo/package_info.py index 709159dd5..d77e30463 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -16,7 +16,7 @@ MAJOR = 1 MINOR = 19 PATCH = 0 -PRE_RELEASE = 'rc0' +PRE_RELEASE = '' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb index 50aa60260..193680f6d 100644 --- a/tutorials/00_NeMo_Primer.ipynb +++ b/tutorials/00_NeMo_Primer.ipynb @@ -14,7 +14,7 @@ "\n", "The toolkit comes with extendable collections of pre-built modules and ready-to-use models for automatic speech recognition (ASR), natural language processing (NLP) and text synthesis (TTS). Built for speed, NeMo can utilize NVIDIA's Tensor Cores and scale out training to multiple GPUs and multiple nodes.\n", "\n", - "For more information, please visit https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/#" + "For more information, please visit https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/#" ] }, { @@ -42,7 +42,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Install TorchAudio\n", @@ -1146,7 +1146,7 @@ "\n", "NeMo constantly adds new models and new tasks to these examples, such that these examples serve as the basis to train and evaluate models from scratch with the provided config files.\n", "\n", - "NeMo Examples directory can be found here - https://github.com/NVIDIA/NeMo/tree/main/examples" + "NeMo Examples directory can be found here - https://github.com/NVIDIA/NeMo/tree/r1.19.0/examples" ] }, { @@ -1251,7 +1251,7 @@ "\n", "While the tutorials are a great example of the simplicity of NeMo, please note for the best performance when training on real datasets, we advice the use of the example scripts instead of the tutorial notebooks. 
\n", "\n", - "NeMo Tutorials directory can be found here - https://github.com/NVIDIA/NeMo/tree/main/tutorials" + "NeMo Tutorials directory can be found here - https://github.com/NVIDIA/NeMo/tree/r1.19.0/tutorials" ] } ], diff --git a/tutorials/01_NeMo_Models.ipynb b/tutorials/01_NeMo_Models.ipynb index 6f230e62c..2a65509bd 100644 --- a/tutorials/01_NeMo_Models.ipynb +++ b/tutorials/01_NeMo_Models.ipynb @@ -37,7 +37,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/02_NeMo_Adapters.ipynb b/tutorials/02_NeMo_Adapters.ipynb index 51a91a3c7..e6874d141 100644 --- a/tutorials/02_NeMo_Adapters.ipynb +++ b/tutorials/02_NeMo_Adapters.ipynb @@ -25,7 +25,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/AudioTranslationSample.ipynb b/tutorials/AudioTranslationSample.ipynb index e8fb33aba..ac79ca3b2 100644 --- a/tutorials/AudioTranslationSample.ipynb +++ b/tutorials/AudioTranslationSample.ipynb @@ -38,7 +38,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n" ] }, @@ -249,7 +249,7 @@ "* [Speech Synthesis](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/tts/Inference_ModelSelect.ipynb)\n", "\n", "\n", - "You can find scripts for training and fine-tuning ASR, NLP and TTS models [here](https://github.com/NVIDIA/NeMo/tree/main/examples). " + "You can find scripts for training and fine-tuning ASR, NLP and TTS models [here](https://github.com/NVIDIA/NeMo/tree/r1.19.0/examples). " ] } ], diff --git a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb index 1b951e7b9..da2e53fd9 100644 --- a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb +++ b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb @@ -41,7 +41,7 @@ "!pip install text-unidecode\n", "\n", "### Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, diff --git a/tutorials/VoiceSwapSample.ipynb b/tutorials/VoiceSwapSample.ipynb index addf19f3b..ea8356981 100644 --- a/tutorials/VoiceSwapSample.ipynb +++ b/tutorials/VoiceSwapSample.ipynb @@ -39,7 +39,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n" ] }, @@ -283,7 +283,7 @@ "* [Speech Synthesis](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/tts/Inference_ModelSelect.ipynb)\n", "\n", "\n", - "You can find scripts for training and fine-tuning ASR, NLP and TTS models [here](https://github.com/NVIDIA/NeMo/tree/main/examples). " + "You can find scripts for training and fine-tuning ASR, NLP and TTS models [here](https://github.com/NVIDIA/NeMo/tree/r1.19.0/examples). 
" ] }, { diff --git a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb index b9c0db866..fac120e1b 100644 --- a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb +++ b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb @@ -40,7 +40,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", @@ -71,7 +71,7 @@ "\n", "For this tutorial (and limited by the compute and storage available on Colab environments), we will attempt to fine-tune an English ASR model onto the [Mozilla Common Voice](https://commonvoice.mozilla.org/en) dataset for Japanese. This dataset will also allow us to discuss a few details for fine-tuning low-resource languages. The methods discussed here can also be applied to languages with several thousand hours of data!\n", "\n", - "**Note**: It is advised to review the execution flow diagram for ASR models in order to correctly setup the model prior to fine-tuning - [ASR CTC Examples](https://github.com/NVIDIA/NeMo/blob/main/examples/asr/asr_ctc/README.md)\n" + "**Note**: It is advised to review the execution flow diagram for ASR models in order to correctly setup the model prior to fine-tuning - [ASR CTC Examples](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/asr/asr_ctc/README.md)\n" ] }, { diff --git a/tutorials/asr/ASR_Example_CommonVoice_Finetuning.ipynb b/tutorials/asr/ASR_Example_CommonVoice_Finetuning.ipynb index 5293f8504..c0af01bd2 100644 --- a/tutorials/asr/ASR_Example_CommonVoice_Finetuning.ipynb +++ b/tutorials/asr/ASR_Example_CommonVoice_Finetuning.ipynb @@ -10,7 +10,7 @@ "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\n", "\n", - "Training an ASR model for a new language can be challenging, especially for low-resource languages (see [example](https://github.com/NVIDIA/NeMo/blob/main/docs/source/asr/examples/kinyarwanda_asr.rst) for Kinyarwanda CommonVoice ASR model).\n", + "Training an ASR model for a new language can be challenging, especially for low-resource languages (see [example](https://github.com/NVIDIA/NeMo/blob/r1.19.0/docs/source/asr/examples/kinyarwanda_asr.rst) for Kinyarwanda CommonVoice ASR model).\n", "\n", "This example describes all basic steps required to build ASR model for Esperanto:\n", "\n", @@ -160,7 +160,7 @@ "\n", "The tarred dataset allows storing the dataset as large *.tar files instead of small separate audio files. It may speed up the training and minimizes the load when data is moved from storage to GPU nodes.\n", "\n", - "The NeMo toolkit provides a [script]( https://github.com/NVIDIA/NeMo/blob/main/scripts/speech_recognition/convert_to_tarred_audio_dataset.py) to get tarred dataset.\n", + "The NeMo toolkit provides a [script]( https://github.com/NVIDIA/NeMo/blob/r1.19.0/scripts/speech_recognition/convert_to_tarred_audio_dataset.py) to get tarred dataset.\n", "\n", "```bash\n", "\n", @@ -207,11 +207,11 @@ "source": [ "## Training hyper-parameters\n", "\n", - "The training parameters are defined in the [config file](https://github.com/NVIDIA/NeMo/blob/main/examples/asr/conf/conformer/conformer_ctc_bpe.yaml) (general description of the [ASR configuration file](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/configs.html)). 
As an encoder, the [Conformer model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-ctc) is used here, the training parameters for which are already well configured based on the training English models. However, the set of optimal parameters may differ for a new language. In this section, we will look at the set of simple parameters that can improve recognition quality for a new language without digging into the details of the Conformer model too much.\n", + "The training parameters are defined in the [config file](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/asr/conf/conformer/conformer_ctc_bpe.yaml) (general description of the [ASR configuration file](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/configs.html)). As an encoder, the [Conformer model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-ctc) is used here, the training parameters for which are already well configured based on the training English models. However, the set of optimal parameters may differ for a new language. In this section, we will look at the set of simple parameters that can improve recognition quality for a new language without digging into the details of the Conformer model too much.\n", "\n", "### Select Training Batch Size\n", "\n", - "We trained model on server with 16 V100 GPUs with 32 GB. We use a local batch size = 32 per GPU V100), so global batch size is 32x16=512. In general, we observed, that global batch between 512 and 2048 works well for Conformer-CTC-Large model. One can use the [accumulate_grad_batches](https://github.com/NVIDIA/NeMo/blob/main/examples/asr/conf/conformer/conformer_ctc_bpe.yaml#L173) parameter to increase the size of the global batch, which is equal to *local_batch * num_gpu * accumulate_grad_batches*.\n", + "We trained model on server with 16 V100 GPUs with 32 GB. We use a local batch size = 32 per GPU V100), so global batch size is 32x16=512. In general, we observed, that global batch between 512 and 2048 works well for Conformer-CTC-Large model. 
One can use the [accumulate_grad_batches](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/asr/conf/conformer/conformer_ctc_bpe.yaml#L173) parameter to increase the size of the global batch, which is equal to *local_batch * num_gpu * accumulate_grad_batches*.\n", "\n", "### Selecting Optimizer and Learning Rate Scheduler\n", "\n", @@ -327,7 +327,7 @@ "+init_from_pretrained_model=${PRETRAINED_MODEL_NAME}\n", "```\n", "\n", - "If the size of the vocabulary differs from the one presented in the pretrained model, you need to change the vocabulary manually as done in the [finetuning tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb).\n", + "If the size of the vocabulary differs from the one presented in the pretrained model, you need to change the vocabulary manually as done in the [finetuning tutorial](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb).\n", "\n", "```python\n", "model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(f\"nvidia/{PRETRAINED_MODEL_NAME}\", map_location='cpu')\n", diff --git a/tutorials/asr/ASR_for_telephony_speech.ipynb b/tutorials/asr/ASR_for_telephony_speech.ipynb index 11ba4b85b..48be4b4db 100644 --- a/tutorials/asr/ASR_for_telephony_speech.ipynb +++ b/tutorials/asr/ASR_for_telephony_speech.ipynb @@ -28,7 +28,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/ASR_with_NeMo.ipynb b/tutorials/asr/ASR_with_NeMo.ipynb index 0c0d239bf..c1f62a871 100644 --- a/tutorials/asr/ASR_with_NeMo.ipynb +++ b/tutorials/asr/ASR_with_NeMo.ipynb @@ -54,7 +54,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", @@ -588,7 +588,7 @@ "\n", "if not os.path.exists(config_path):\n", " # Grab the config we'll use in this example\n", - " BRANCH = 'main'\n", + " BRANCH = 'r1.19.0'\n", " !mkdir configs\n", " !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml\n", "\n", diff --git a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb index b932916f2..cf4d8442f 100644 --- a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb +++ b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb @@ -41,7 +41,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index e6bccc3f0..7846a1468 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -29,7 +29,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/Buffered_Transducer_Inference.ipynb b/tutorials/asr/Buffered_Transducer_Inference.ipynb index 
c23398dca..bc1209a80 100644 --- a/tutorials/asr/Buffered_Transducer_Inference.ipynb +++ b/tutorials/asr/Buffered_Transducer_Inference.ipynb @@ -28,7 +28,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "# Update numba and restart (this is required to update internal numba version of Colab)\n", diff --git a/tutorials/asr/Buffered_Transducer_Inference_with_LCS_Merge.ipynb b/tutorials/asr/Buffered_Transducer_Inference_with_LCS_Merge.ipynb index 2f179eaa9..fad96a609 100644 --- a/tutorials/asr/Buffered_Transducer_Inference_with_LCS_Merge.ipynb +++ b/tutorials/asr/Buffered_Transducer_Inference_with_LCS_Merge.ipynb @@ -46,7 +46,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "# Update numba and restart (this is required to update internal numba version of Colab)\n", diff --git a/tutorials/asr/Intro_to_Transducers.ipynb b/tutorials/asr/Intro_to_Transducers.ipynb index d3928bed9..c82d7ed86 100644 --- a/tutorials/asr/Intro_to_Transducers.ipynb +++ b/tutorials/asr/Intro_to_Transducers.ipynb @@ -44,7 +44,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ], "execution_count": null, @@ -225,7 +225,7 @@ "id": "0W12xF_CqcVF" }, "source": [ - "![](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/images/transducer.png?raw=true)" + "![](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/asr/images/transducer.png?raw=true)" ] }, { diff --git a/tutorials/asr/Multilang_ASR.ipynb b/tutorials/asr/Multilang_ASR.ipynb index a1edeea81..431dc515a 100644 --- a/tutorials/asr/Multilang_ASR.ipynb +++ b/tutorials/asr/Multilang_ASR.ipynb @@ -104,7 +104,7 @@ "\n", "## Install NeMo\n", "## We are using the main branch but you might want to adjust that too\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", @@ -204,7 +204,7 @@ "outputs": [], "source": [ "if not os.path.exists(\"get_librispeech_data.py\"):\n", - " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/dataset_processing/get_librispeech_data.py" + " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/dataset_processing/get_librispeech_data.py" ] }, { @@ -296,7 +296,7 @@ "outputs": [], "source": [ "if not os.path.exists(\"get_commonvoice_data.py\"):\n", - " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/dataset_processing/get_commonvoice_data.py" + " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/dataset_processing/get_commonvoice_data.py" ] }, { @@ -800,7 +800,7 @@ "outputs": [], "source": [ "if not os.path.exists(\"process_asr_text_tokenizer.py\"):\n", - " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py" + " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/tokenizers/process_asr_text_tokenizer.py" ] }, { diff --git a/tutorials/asr/Offline_ASR.ipynb b/tutorials/asr/Offline_ASR.ipynb index fc8af2e76..685d3ef6f 100644 --- a/tutorials/asr/Offline_ASR.ipynb +++ b/tutorials/asr/Offline_ASR.ipynb @@ -30,7 +30,7 
@@ "* use beam search decoder with N-gram language model re-scoring\n", "\n", "You may find more info on how to train and use language models for ASR models here:\n", - "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html\n", + "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/asr/asr_language_modeling.html\n", "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n" ] }, @@ -52,7 +52,7 @@ "id": "I9eIxAyKHREB" }, "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "try:\n", " # Import NeMo Speech Recognition collection\n", " import nemo.collections.asr as nemo_asr\n", diff --git a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb index b38fab2c9..9d4f66b82 100644 --- a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb +++ b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb @@ -23,7 +23,7 @@ "!pip install wget\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", diff --git a/tutorials/asr/Online_ASR_Microphone_Demo.ipynb b/tutorials/asr/Online_ASR_Microphone_Demo.ipynb index 31d2c0dec..6a1ac0bb1 100644 --- a/tutorials/asr/Online_ASR_Microphone_Demo.ipynb +++ b/tutorials/asr/Online_ASR_Microphone_Demo.ipynb @@ -27,7 +27,7 @@ "!pip install pyaudio\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/Online_Noise_Augmentation.ipynb b/tutorials/asr/Online_Noise_Augmentation.ipynb index f8741cdcb..b2fbf1a2b 100644 --- a/tutorials/asr/Online_Noise_Augmentation.ipynb +++ b/tutorials/asr/Online_Noise_Augmentation.ipynb @@ -32,7 +32,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb b/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb index 7a8dacd82..e642fd4f6 100644 --- a/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb +++ b/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb @@ -27,7 +27,7 @@ "!pip install pyaudio\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", @@ -67,7 +67,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook requires the `torchaudio` library to be installed for MarbleNet. Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/main/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.\n" + "This notebook requires the `torchaudio` library to be installed for MarbleNet. 
Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/r1.19.0/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.\n" ] }, { diff --git a/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb b/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb index c704ee114..23e31e5b0 100644 --- a/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb +++ b/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb @@ -29,7 +29,7 @@ "!pip install pyaudio\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", @@ -59,7 +59,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook requires the `torchaudio` library to be installed for MatchboxNet. Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/main/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.\n" + "This notebook requires the `torchaudio` library to be installed for MatchboxNet. Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/r1.19.0/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.\n" ] }, { diff --git a/tutorials/asr/Self_Supervised_Pre_Training.ipynb b/tutorials/asr/Self_Supervised_Pre_Training.ipynb index 04998f68f..fe47a62e2 100644 --- a/tutorials/asr/Self_Supervised_Pre_Training.ipynb +++ b/tutorials/asr/Self_Supervised_Pre_Training.ipynb @@ -28,7 +28,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", @@ -51,7 +51,7 @@ "\n", "The approach we will use for pre-training our models is represented in the following diagram:\n", "\n", - " ![SSL diagram](https://raw.githubusercontent.com/NVIDIA/NeMo/main/tutorials/asr/images/contrastive_ssl.png)\n", + " ![SSL diagram](https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/tutorials/asr/images/contrastive_ssl.png)\n", "\n", "We first mask parts of our input using SpecAugment. The model is then trained to solve a contrastive task of distinguishing the latent representation of the masked time steps from several sampled distractors. Since our encoders also contain stride blocks which reduce the length of the inputs, in order to obtain target representations we combine several consecutive time steps. They are then passed through a quantizer, which has been found to help with contrastive pre-training." 
] @@ -272,8 +272,8 @@ "source": [ "## Grab the configs we'll use in this example\n", "!mkdir configs\n", - "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml\n", - "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/citrinet/citrinet_1024.yaml\n" + "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml\n", + "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/asr/conf/citrinet/citrinet_1024.yaml\n" ] }, { @@ -482,7 +482,7 @@ "outputs": [], "source": [ "!mkdir scripts\n", - "!wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py\n", + "!wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/tokenizers/process_asr_text_tokenizer.py\n", "\n", "!python ./scripts/process_asr_text_tokenizer.py \\\n", " --manifest=\"{data_dir}/an4/train_manifest.json\" \\\n", diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb index 208752347..b26cba7da 100644 --- a/tutorials/asr/Speech_Commands.ipynb +++ b/tutorials/asr/Speech_Commands.ipynb @@ -61,7 +61,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/asr/Streaming_ASR.ipynb b/tutorials/asr/Streaming_ASR.ipynb index a4701dc02..d90bf3633 100644 --- a/tutorials/asr/Streaming_ASR.ipynb +++ b/tutorials/asr/Streaming_ASR.ipynb @@ -28,7 +28,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", @@ -62,7 +62,7 @@ "* Real-time or close to real-time inference for live transcriptions\n", "* Offline transcriptions of very long audio\n", "\n", - "In this tutorial, we will mainly focus on streaming for handling long form audio and close to real-time inference with CTC based models. For training ASR models we usually use short segments of audio (<20s) that may be smaller chunks of a long audio that is aligned with the transcriptions and segmented into smaller chunks (see [tools/](https://github.com/NVIDIA/NeMo/tree/main/tools) for some great tools to do this). For running inference on long audio files we are restricted by the available GPU memory that dictates the maximum length of audio that can be transcribed in one inference call. We will take a look at one of the ways to overcome this restriction using NeMo's Conformer-CTC ASR model." + "In this tutorial, we will mainly focus on streaming for handling long form audio and close to real-time inference with CTC based models. For training ASR models we usually use short segments of audio (<20s) that may be smaller chunks of a long audio that is aligned with the transcriptions and segmented into smaller chunks (see [tools/](https://github.com/NVIDIA/NeMo/tree/r1.19.0/tools) for some great tools to do this). For running inference on long audio files we are restricted by the available GPU memory that dictates the maximum length of audio that can be transcribed in one inference call. 
We will take a look at one of the ways to overcome this restriction using NeMo's Conformer-CTC ASR model." ] }, { diff --git a/tutorials/asr/Voice_Activity_Detection.ipynb b/tutorials/asr/Voice_Activity_Detection.ipynb index b8013822c..b4c7e33f7 100644 --- a/tutorials/asr/Voice_Activity_Detection.ipynb +++ b/tutorials/asr/Voice_Activity_Detection.ipynb @@ -28,7 +28,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index 62481c376..80cf4ecac 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -50,7 +50,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", @@ -1297,7 +1297,7 @@ "source": [ "# Further reading\n", "\n", - "For efficient scripts to add, train, and evaluate adapter augmented models, please refer to the [Adapters example section](https://github.com/NVIDIA/NeMo/tree/main/examples/asr/asr_adapters).\n", + "For efficient scripts to add, train, and evaluate adapter augmented models, please refer to the [Adapters example section](https://github.com/NVIDIA/NeMo/tree/r1.19.0/examples/asr/asr_adapters).\n", "\n", "Please follow the following articles that discuss the use of adapters in ASR - \n", "- [Exploiting Adapters for Cross-lingual Low-resource Speech Recognition](https://arxiv.org/abs/2105.11905)\n", diff --git a/tutorials/cloud/aws/ASR_Finetuning_at_Scale_with_AWS_SageMaker.ipynb b/tutorials/cloud/aws/ASR_Finetuning_at_Scale_with_AWS_SageMaker.ipynb index c4406a4f0..97697781d 100644 --- a/tutorials/cloud/aws/ASR_Finetuning_at_Scale_with_AWS_SageMaker.ipynb +++ b/tutorials/cloud/aws/ASR_Finetuning_at_Scale_with_AWS_SageMaker.ipynb @@ -70,7 +70,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", @@ -193,17 +193,17 @@ "config_path = str(config_dir / \"config.yaml\")\n", "\n", "# download scripts to format the data source.\n", - "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speech_recognition/convert_hf_dataset_to_nemo.py\", str(code_dir))\n", - "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speech_recognition/convert_to_tarred_audio_dataset.py\",\n", + "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/speech_recognition/convert_hf_dataset_to_nemo.py\", str(code_dir))\n", + "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/speech_recognition/convert_to_tarred_audio_dataset.py\",\n", " str(code_dir))\n", "\n", "# download scripts to run training\n", - "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/conformer/conformer_ctc_bpe.yaml\", config_path)\n", - "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py\",\n", + 
"wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/asr/conf/conformer/conformer_ctc_bpe.yaml\", config_path)\n", + "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py\",\n", " str(code_dir))\n", "\n", "# download script to create tokenizer\n", - "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py\",\n", + "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/tokenizers/process_asr_text_tokenizer.py\",\n", " str(code_dir))" ] }, diff --git a/tutorials/cloud/aws/SageMaker_ASR_Training.ipynb b/tutorials/cloud/aws/SageMaker_ASR_Training.ipynb index 8cf540b27..078e76d55 100644 --- a/tutorials/cloud/aws/SageMaker_ASR_Training.ipynb +++ b/tutorials/cloud/aws/SageMaker_ASR_Training.ipynb @@ -55,7 +55,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", @@ -173,8 +173,8 @@ "outputs": [], "source": [ "config_path = str(config_dir / \"config.yaml\")\n", - "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/conformer/conformer_ctc_char.yaml\", config_path)\n", - "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/asr_ctc/speech_to_text_ctc.py\", str(code_dir))" + "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/asr/conf/conformer/conformer_ctc_char.yaml\", config_path)\n", + "wget.download(\"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/asr/asr_ctc/speech_to_text_ctc.py\", str(code_dir))" ] }, { diff --git a/tutorials/nlp/01_Pretrained_Language_Models_for_Downstream_Tasks.ipynb b/tutorials/nlp/01_Pretrained_Language_Models_for_Downstream_Tasks.ipynb index faa93de12..c18ebbac5 100644 --- a/tutorials/nlp/01_Pretrained_Language_Models_for_Downstream_Tasks.ipynb +++ b/tutorials/nlp/01_Pretrained_Language_Models_for_Downstream_Tasks.ipynb @@ -26,7 +26,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" ] }, @@ -152,7 +152,7 @@ "id": "jEgEo0aPj3Ws" }, "source": [ - "All NeMo [NLP models](https://github.com/NVIDIA/NeMo/tree/main/examples/nlp) have an associated config file. As an example, let's examine the config file for the Named Entity Recognition (NER) model (more details about the model and the NER task could be found [here](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb))." + "All NeMo [NLP models](https://github.com/NVIDIA/NeMo/tree/r1.19.0/examples/nlp) have an associated config file. As an example, let's examine the config file for the Named Entity Recognition (NER) model (more details about the model and the NER task could be found [here](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb))." ] }, { @@ -261,7 +261,7 @@ "id": "EVp4zvxPatga" }, "source": [ - "and then start the training as usual (please see [tutorials/nlp](https://github.com/NVIDIA/NeMo/tree/main/tutorials/nlp) for more details about training of a particular model). 
\n", + "and then start the training as usual (please see [tutorials/nlp](https://github.com/NVIDIA/NeMo/tree/r1.19.0/tutorials/nlp) for more details about training of a particular model). \n", "\n", "You can also provide a pretrained language model checkpoint and a configuration file if available.\n", "\n", @@ -349,7 +349,7 @@ "model.language_model.lm_checkpoint= \\\n", "model.language_model.config_file=`\n", "\n", - "The general Megatron-LM model names are used to download the correct vocabulary file needed to setup the model correctly. Note, the data preprocessing and model training is done in NeMo. Megatron-LM has its own set of training arguments (including tokenizer) that are ignored during finetuning in NeMo. Please see downstream task [config files and training scripts](https://github.com/NVIDIA/NeMo/tree/main/examples/nlp) for all NeMo supported arguments.\n", + "The general Megatron-LM model names are used to download the correct vocabulary file needed to setup the model correctly. Note, the data preprocessing and model training is done in NeMo. Megatron-LM has its own set of training arguments (including tokenizer) that are ignored during finetuning in NeMo. Please see downstream task [config files and training scripts](https://github.com/NVIDIA/NeMo/tree/r1.19.0/examples/nlp) for all NeMo supported arguments.\n", "\n", "## Download pretrained model\n", "\n", @@ -373,7 +373,7 @@ "source": [ "# Using any HuggingFace Pretrained Model\n", "\n", - "Currently, there are 4 HuggingFace language models that have the most extensive support in [NeMo](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/nlp/modules/common/huggingface): \n", + "Currently, there are 4 HuggingFace language models that have the most extensive support in [NeMo](https://github.com/NVIDIA/NeMo/tree/r1.19.0/nemo/collections/nlp/modules/common/huggingface): \n", "\n", "* BERT\n", "* RoBERTa\n", @@ -383,7 +383,7 @@ "As was mentioned before, just set `model.language_model.pretrained_model_name` to the desired model name in your config and get_lm_model() will take care of the rest.\n", "\n", "If you want to use another language model from [https://huggingface.co/models](https://huggingface.co/models), use HuggingFace API directly in NeMo.\n", - "More details on model training could be found at [tutorials](https://github.com/NVIDIA/NeMo/tree/main/tutorials)." + "More details on model training could be found at [tutorials](https://github.com/NVIDIA/NeMo/tree/r1.19.0/tutorials)." 
] } ], diff --git a/tutorials/nlp/02_NLP_Tokenizers.ipynb b/tutorials/nlp/02_NLP_Tokenizers.ipynb index c63d2a8b1..5c909fe73 100644 --- a/tutorials/nlp/02_NLP_Tokenizers.ipynb +++ b/tutorials/nlp/02_NLP_Tokenizers.ipynb @@ -10,7 +10,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.19.0'" ] }, { @@ -35,7 +35,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" ] }, diff --git a/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb b/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb index 323bfa1c4..28d5330ac 100644 --- a/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb +++ b/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb @@ -217,7 +217,7 @@ "print()\n", "\n", "\n", - "!wget https://raw.github.com/NVIDIA/NeMo/main/scripts/neural_machine_translation/filter_langs_nmt.py \\\n", + "!wget https://raw.github.com/NVIDIA/NeMo/r1.19.0/scripts/neural_machine_translation/filter_langs_nmt.py \\\n", " -O filter_langs_nmt.py\n", "\n", "!python filter_langs_nmt.py \\\n", @@ -300,7 +300,7 @@ "\n", "## Install NeMo\n", "\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "!pip uninstall -y sacrebleu\n", @@ -760,7 +760,7 @@ "metadata": {}, "outputs": [], "source": [ - "!wget https://raw.github.com/NVIDIA/NeMo/main/examples/nlp/machine_translation/create_tarred_parallel_dataset.py \\\n", + "!wget https://raw.github.com/NVIDIA/NeMo/r1.19.0/examples/nlp/machine_translation/create_tarred_parallel_dataset.py \\\n", " -O create_tarred_parallel_dataset.py\n", "\n", "!python create_tarred_parallel_dataset.py \\\n", diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb index ddd3bdd4f..cf0392da9 100644 --- a/tutorials/nlp/Dialogue.ipynb +++ b/tutorials/nlp/Dialogue.ipynb @@ -27,7 +27,7 @@ "outputs": [], "source": [ "import os \n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n", "!git clone https://github.com/NVIDIA/NeMo --branch $BRANCH\n", "os.chdir('NeMo')\n", diff --git a/tutorials/nlp/Entity_Linking_Medical.ipynb b/tutorials/nlp/Entity_Linking_Medical.ipynb index ff8eda123..892eb881b 100644 --- a/tutorials/nlp/Entity_Linking_Medical.ipynb +++ b/tutorials/nlp/Entity_Linking_Medical.ipynb @@ -17,7 +17,7 @@ "\"\"\"\n", "\n", "## Install NeMo if using google collab or if its not installed locally\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, @@ -68,7 +68,7 @@ "#### Task Description\n", "[Entity linking](https://en.wikipedia.org/wiki/Entity_linking) is the process of connecting concepts mentioned in natural language to their canonical forms stored in a knowledge base. For example, say a knowledge base contained the entity 'ID3452 influenza' and we wanted to process some natural language containing the sentence \"The patient has flu like symptoms\". An entity linking model would match the word 'flu' to the knowledge base entity 'ID3452 influenza', allowing for disambiguation and normalization of concepts referenced in text. Entity linking applications range from helping automate data ingestion to assisting in real time dialogue concept normalization. 
We will be focusing on entity linking in the medical domain for this demo, but the entity linking model, dataset, and training code within NVIDIA NeMo can be applied to other domains like finance and retail.\n", - "Within NeMo and this tutorial we use the entity linking approach described in Liu et. al's NAACL 2021 \"[Self-alignment Pre-training for Biomedical Entity Representations](https://arxiv.org/abs/2010.11784v2)\". The main idea behind this approach is to reshape an initial concept embedding space such that synonyms of the same concept are pulled closer together and unrelated concepts are pushed further apart. The concept embeddings from this reshaped space can then be used to build a knowledge base embedding index. This index stores concept IDs mapped to their respective concept embeddings in a format conducive to efficient nearest neighbor search. We can link query concepts to their canonical forms in the knowledge base by performing a nearest neighbor search- matching concept query embeddings to the most similar concepts embeddings in the knowledge base index. \n", + "Within NeMo and this tutorial we use the entity linking approach described in Liu et. al's NAACL 2021 \"[Self-alignment Pre-training for Biomedical Entity Representations](https://arxiv.org/abs/2010.11784v2)\". The main idea behind this approach is to reshape an initial concept embedding space such that synonyms of the same concept are pulled closer together and unrelated concepts are pushed further apart. The concept embeddings from this reshaped space can then be used to build a knowledge base embedding index. This index stores concept IDs mapped to their respective concept embeddings in a format conducive to efficient nearest neighbor search. We can link query concepts to their canonical forms in the knowledge base by performing a nearest neighbor search- matching concept query embeddings to the most similar concepts embeddings in the knowledge base index. \n", "\n", "In this tutorial we will be using the [faiss](https://github.com/facebookresearch/faiss) library to build our concept index." ] diff --git a/tutorials/nlp/GLUE_Benchmark.ipynb b/tutorials/nlp/GLUE_Benchmark.ipynb index d8fe75940..1c60b95bc 100644 --- a/tutorials/nlp/GLUE_Benchmark.ipynb +++ b/tutorials/nlp/GLUE_Benchmark.ipynb @@ -44,7 +44,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" + "BRANCH = 'r1.19.0'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" ], "execution_count": null, "outputs": [] diff --git a/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb b/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb index 6204bf251..50ec879b7 100644 --- a/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb +++ b/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb @@ -21,7 +21,7 @@ "import os\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "\n", "GITHUB_ACCOUNT = 'NVIDIA' # change this if using a fork\n", "\n", @@ -284,7 +284,7 @@ "id": "miXYxOv_mNVo" }, "source": [ - "The script [prepare_corpora_for_alignment.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py) prepares the described parallel corpora.
It extracts all unique ITN phrase-pairs from the Google TN dataset, tokenizes them as described above and stores in separate folders for each semiotic class. It also generates a bash script for running the alignment. At the end it prints how many examples it has found:\n", + "The script [prepare_corpora_for_alignment.py](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py) prepares the described parallel corpora. It extracts all unique ITN phrase-pairs from the Google TN dataset, tokenizes them as described above and stores in separate folders for each semiotic class. It also generates a bash script for running the alignment. At the end it prints how many examples it has found:\n", "```\n", "content/alignment/punct has 920953 instances\n", "content/alignment/date has 150499 instances\n", @@ -405,7 +405,7 @@ { "cell_type": "markdown", "source": [ - "GIZA++ will generate many files in our class folders, but we need only two files with final alignments, those with suffixes `A3.final`. The two files correspond to the alignments produced by two GIZA++ runs - direct and reverse (switching source and target corpus). This is a common practice, it allows us to find safer alignment points - tokens that were aligned to one another in both runs. The script [extract_giza_alignments.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py) heuristically combines these two GIZA++ alignments. It also applies a bunch of regular expressions to correct some alignment mistakes." + "GIZA++ will generate many files in our class folders, but we need only two files with final alignments, those with suffixes `A3.final`. The two files correspond to the alignments produced by two GIZA++ runs - direct and reverse (switching source and target corpus). This is a common practice, it allows us to find safer alignment points - tokens that were aligned to one another in both runs. The script [extract_giza_alignments.py](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py) heuristically combines these two GIZA++ alignments. It also applies a bunch of regular expressions to correct some alignment mistakes." 
], "metadata": { "id": "ueJYVF0cU3ic" @@ -1016,11 +1016,11 @@ "\n", "See also the scripts for the whole pipeline:\n", "\n", - "> [prepare_dataset_en.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/prepare_dataset_en.sh)\n", + "> [prepare_dataset_en.sh](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/text_normalization_as_tagging/prepare_dataset_en.sh)\n", "\n", - "> [normalization_as_tagging_train.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_train.py)\n", + "> [normalization_as_tagging_train.py](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_train.py)\n", "\n", - "> [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/run_infer.sh)\n", + "> [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/text_normalization_as_tagging/run_infer.sh)\n", "\n" ], "metadata": { diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb index 104d69df1..c656fdd70 100644 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb @@ -22,7 +22,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" ] }, diff --git a/tutorials/nlp/MegatronBert_export.ipynb b/tutorials/nlp/MegatronBert_export.ipynb index f925d2bc5..5bec75028 100644 --- a/tutorials/nlp/MegatronBert_export.ipynb +++ b/tutorials/nlp/MegatronBert_export.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH='main'" + "BRANCH='r1.19.0'" ] }, { @@ -64,7 +64,7 @@ "\n", "If you prefer to use the Huggingface BERT models, please skip this section and refer to `Setting up a NeMo Experiment` section to load a model from `nemo_nlp.modules.get_pretrained_lm_models_list()`\n", "\n", - "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." + "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." 
] }, { diff --git a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb index e1aa32f7b..a92317b17 100644 --- a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb +++ b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb @@ -62,7 +62,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "DATA_PATH='.'\n", "TRANSACTIONS=DATA_PATH+'/card_transaction.v1.csv'\n", "#CHECKPOINTS='/chk_points'\n", diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb index 004014ebd..02d533e59 100644 --- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb +++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH='main'" + "BRANCH='r1.19.0'" ] }, { @@ -45,7 +45,7 @@ "\n", "- Our p-tuning implementation is based off Liu et al's paper [GPT Understands, Too](https://arxiv.org/abs/2103.10385).\n", "\n", - "- Command line usage examples and API documentation can be found in [our user docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/prompt_learning.html). \n", + "- Command line usage examples and API documentation can be found in [our user docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/nlp/nemo_megatron/prompt_learning.html). \n", "\n", "\"Prompt\n", "\n", @@ -88,7 +88,7 @@ "# The Best of Both\n", "A single pretrained GPT model can use both p-tuning and prompt-tuning. While you must decide to use either p-tuning or prompt-tuning for each task you want your model to perform, you can p-tune your model on a set of tasks A, then prompt tune your same model on a different set of tasks B, then finally run inference on tasks from both A and B at the same time. During prompt-tuning or p-tuning, tasks tuned at the same time must use the same number of virtual tokens. During inference, tasks using differing amounts of virtual tokens can be run at the same time.\n", "\n", - "Please see our [docs for more comparisons between prompt and p-tuning](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/prompt_learning.html). \n", + "Please see our [docs for more comparisons between prompt and p-tuning](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/nlp/nemo_megatron/prompt_learning.html). \n", "\n", "With all that covered, let's get started!\n" ] @@ -723,7 +723,7 @@ "- `length_params`\n", "- `sampling_params`\n", "\n", - "as arguments. More information about the [text generation API can be found here](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/transformer/text_generation.py).\n", + "as arguments. More information about the [text generation API can be found here](https://github.com/NVIDIA/NeMo/blob/r1.19.0/nemo/collections/nlp/modules/common/transformer/text_generation.py).\n", "\n", "If `length_params` and `sampling_params` are set to `None`, the model generates output with a greedy decoding strategy and generates up to `30` new tokens. Most predictive downstream tasks (not text generation tasks), use greedy sampling. 
To see other ways to run inference with your prompt learning model and more details on how to define various inference parameters, visit `examples/nlp/language_modeling/megatron_gpt_eval.py`.\n", "\n", diff --git a/tutorials/nlp/Punctuation_and_Capitalization.ipynb b/tutorials/nlp/Punctuation_and_Capitalization.ipynb index 1519c2343..54e424c83 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.19.0'" ] }, { @@ -293,7 +293,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you see, `get_tatoeba_data.py` script provides not only downloads Tatoeba but also creates labels. If you wish to preprocess your own data, use [examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py) script:\n", + "As you see, `get_tatoeba_data.py` script provides not only downloads Tatoeba but also creates labels. If you wish to preprocess your own data, use [examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py) script:\n", "\n", "```\n", "NEMO_ROOT = \"\"\n", @@ -421,7 +421,7 @@ "\n", "- **trainer**: Any argument to be passed to PyTorch Lightning\n", "\n", - "See [docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#training-punctuation-and-capitalization-model) for full config description." + "See [docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/nlp/punctuation_and_capitalization.html#training-punctuation-and-capitalization-model) for full config description." ] }, { @@ -950,7 +950,7 @@ "source": [ "## Training Script\n", "\n", - "If you have NeMo installed locally, you can also train the model with [nlp/token_classification/punctuation_capitalization_train_evaluate.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py).\n", + "If you have NeMo installed locally, you can also train the model with [nlp/token_classification/punctuation_capitalization_train_evaluate.py](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py).\n", "\n", "To run training script, use:\n", "\n", diff --git a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb index 5580bc4cf..3ce2ef9fb 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb @@ -10,7 +10,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.19.0'" ] }, { @@ -369,7 +369,7 @@ } }, "source": [ - "As you see, `get_libritts_data.py` script provides not only downloads LibriTTS but also creates labels. If you wish to preprocess your own data, use [examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py) script." + "As you see, `get_libritts_data.py` script provides not only downloads LibriTTS but also creates labels. 
If you wish to preprocess your own data, use [examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py) script." ] }, { @@ -514,7 +514,7 @@ "\n", "- **trainer**: Any argument to be passed to PyTorch Lightning\n", "\n", - "See [docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#training-punctuation-and-capitalization-model) for full config description." + "See [docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/nlp/punctuation_and_capitalization.html#training-punctuation-and-capitalization-model) for full config description." ] }, { @@ -913,7 +913,7 @@ "source": [ "## Training Script\n", "\n", - "If you have NeMo installed locally, you can also train the model with [nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py).\n", + "If you have NeMo installed locally, you can also train the model with [nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py).\n", "\n", "To run training script, use:\n", "\n", diff --git a/tutorials/nlp/Question_Answering.ipynb b/tutorials/nlp/Question_Answering.ipynb index 7217b0fb6..b337c5694 100644 --- a/tutorials/nlp/Question_Answering.ipynb +++ b/tutorials/nlp/Question_Answering.ipynb @@ -74,7 +74,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.19.0'" ] }, { diff --git a/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb b/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb index b7c25cb41..0cd718e71 100644 --- a/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb +++ b/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.19.0'" ] }, { diff --git a/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb b/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb index 5b5b74e7b..3296acd05 100644 --- a/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb +++ b/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb @@ -20,7 +20,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n", "\n" ] diff --git a/tutorials/nlp/Token_Classification-BioMegatron.ipynb b/tutorials/nlp/Token_Classification-BioMegatron.ipynb index 517f2e557..c3f95bff8 100644 --- a/tutorials/nlp/Token_Classification-BioMegatron.ipynb +++ b/tutorials/nlp/Token_Classification-BioMegatron.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH='main'" + "BRANCH='r1.19.0'" ] }, { diff --git a/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb b/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb index c3f7e28b6..9b8007751 100644 --- a/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb +++ b/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb @@ -30,7 +30,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 
'r1.19.0'" ] }, { @@ -53,7 +53,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" + "BRANCH = 'r1.19.0'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" ], "execution_count": null, "outputs": [] diff --git a/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb b/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb index 69df7b27b..9ec4482f3 100644 --- a/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb +++ b/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb @@ -22,7 +22,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" ] }, @@ -630,7 +630,7 @@ "source": [ "## Training Script\n", "\n", - "If you have NeMo installed locally, you can also train the model with [examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py](https://github.com/carolmanderson/NeMo/blob/main/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py).\n", + "If you have NeMo installed locally, you can also train the model with [examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py](https://github.com/carolmanderson/NeMo/blob/r1.19.0/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py).\n", "\n", "To run training script, use:\n", "\n", diff --git a/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb b/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb index ea943b35e..0c0b81636 100644 --- a/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb +++ b/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb @@ -30,7 +30,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", @@ -58,7 +58,7 @@ "For detailed parameter setting and execution of speaker diarization, refer to this [Diarization Inference](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb) tutorial.\n", "\n", "\n", - "An example script that runs ASR and speaker diarization together can be found at [ASR with Diarization](https://github.com/NVIDIA/NeMo/blob/main/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py).\n", + "An example script that runs ASR and speaker diarization together can be found at [ASR with Diarization](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py).\n", "\n", "### Speaker diarization in ASR pipeline\n", "\n", @@ -196,7 +196,7 @@ "DOMAIN_TYPE = \"meeting\" # Can be meeting or telephonic based on domain type of the audio file\n", "CONFIG_FILE_NAME = f\"diar_infer_{DOMAIN_TYPE}.yaml\"\n", "\n", - "CONFIG_URL = f\"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}\"\n", + "CONFIG_URL = f\"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}\"\n", "\n", "if not os.path.exists(os.path.join(data_dir,CONFIG_FILE_NAME)):\n", " CONFIG = wget.download(CONFIG_URL, data_dir)\n", diff --git 
a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb index 1fd0f1b14..93ff3ed97 100644 --- a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb +++ b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb @@ -23,7 +23,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", @@ -62,9 +62,9 @@ "* **with oracle VAD**: use ground-truth speech/non-speech labels. \n", "* **with system VAD**: use speech/non-speech labels generated by an actual VAD model. \n", "\n", - "We will first demonstrate how to perform diarization with a oracle VAD timestamps (we assume we already have speech timestamps) and pretrained speaker embedding extractor model which can be found in tutorial for [Speaker Identification and Verification in NeMo](https://github.com/NVIDIA/NeMo/blob/main/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb).\n", + "We will first demonstrate how to perform diarization with a oracle VAD timestamps (we assume we already have speech timestamps) and pretrained speaker embedding extractor model which can be found in tutorial for [Speaker Identification and Verification in NeMo](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb).\n", "\n", - "In the following section, we will also show how to perform VAD and then diarization if ground truth timestamp speech were not available (non-oracle VAD). We also have tutorials for [VAD training in NeMo](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Voice_Activity_Detection.ipynb) and [online offline microphone inference](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb), where you can custom your model and training/finetuning on your own data.\n", + "In the following section, we will also show how to perform VAD and then diarization if ground truth timestamp speech were not available (non-oracle VAD). We also have tutorials for [VAD training in NeMo](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/asr/Voice_Activity_Detection.ipynb) and [online offline microphone inference](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb), where you can custom your model and training/finetuning on your own data.\n", "\n", "For demonstration purposes we would be using simulated audio from [an4 dataset](http://www.speech.cs.cmu.edu/databases/an4/)." ] @@ -140,7 +140,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " We use a default multi-scale setting in [diar_infer_telephonic.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml) which has 5 scales from 1.5 s to 0.5 s, 50% overlap and equal weights. Note that only the ratio between numbers in `multiscale_weights` since the fused affinity matrix is normalized. For example, \\[1,1,1,1,1\\] and \\[0.5,0.5,0.5,0.5,0.5\\] will lead to the exactly same result." + " We use a default multi-scale setting in [diar_infer_telephonic.yaml](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml) which has 5 scales from 1.5 s to 0.5 s, 50% overlap and equal weights. 
Note that only the ratio between the numbers in `multiscale_weights` matters, since the fused affinity matrix is normalized. For example, \[1,1,1,1,1\] and \[0.5,0.5,0.5,0.5,0.5\] will lead to exactly the same result." ] }, { @@ -191,7 +191,7 @@ "MSDD models employ pairwise (two-speaker) unit-model for both training and inference. While training, pairwise model is trained on data samples with two speakers or two-speaker subset from data samples with more than two speakers. \n", "In inference mode, we retrieve all possible pairs from the estimated number of speakers and average the results. For example, if there are four speakers `(A, B, C, D)`, we extract 6 pairs: `(A,B)`, `(A,C)`, `(A,D)`, `(B,C)`, `(B,D)`, `(C,D)`. Finally, the sigmoid outputs are averaged. In this way, MSDD can deal with flexible number of speakers using a pairwise model. \n", "\n", - "The detailed information on MSDD model and model training can be found in tutorial on [Speaker Diarization Training](https://github.com/NVIDIA/NeMo/blob/main/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb). " + "The detailed information on MSDD model and model training can be found in tutorial on [Speaker Diarization Training](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb). " ] }, { @@ -399,7 +399,7 @@ "from omegaconf import OmegaConf\n", "MODEL_CONFIG = os.path.join(data_dir,'diar_infer_telephonic.yaml')\n", "if not os.path.exists(MODEL_CONFIG):\n", - " config_url = \"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml\"\n", + " config_url = \"https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml\"\n", " MODEL_CONFIG = wget.download(config_url,data_dir)\n", "\n", "config = OmegaConf.load(MODEL_CONFIG)\n", diff --git a/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb b/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb index 3c56df2bb..ab5cab58b 100644 --- a/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb +++ b/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb @@ -18,7 +18,7 @@ "\"\"\"\n", "\n", "NEMO_DIR_PATH = \"NeMo\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "\n", "! git clone https://github.com/NVIDIA/NeMo\n", "%cd NeMo\n", @@ -197,9 +197,9 @@ "\n", "- Please skip this section and go directly to [Prepare Training data for MSDD](#Prepare-Training-data-for-MSDD) section if you have your own speaker diarization dataset. \n", "\n", - "In this tutorial, we use [NeMo Multispeaker Simulator](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tools/Multispeaker_Simulator.ipynb) and the Librispeech corpus to generate a toy training dataset for demonstration purpose.
You can replace the simulated dataset with your own datasets if you have proper speaker annotations (RTTM files) for the dataset. If you do not have access to any speaker diarization datasets, you can use [NeMo Multispeaker Simulator](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tools/Multispeaker_Simulator.ipynb) by generating a good amount of data samples to meet your needs. \n", "\n", - "For more details regarding data simulator, please follow the descriptions in [NeMo Multispeaker Simulator](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tools/Multispeaker_Simulator.ipynb) and we will not cover configurations and detailed process of data simulation in this tutorial. \n" + "For more details regarding data simulator, please follow the descriptions in [NeMo Multispeaker Simulator](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tools/Multispeaker_Simulator.ipynb) and we will not cover configurations and detailed process of data simulation in this tutorial. \n" ] }, { @@ -232,7 +232,7 @@ "source": [ "import os\n", "NEMO_DIR_PATH = \"NeMo\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "\n", "# download scripts if not already there \n", "if not os.path.exists('NeMo/scripts'):\n", diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb index 8e3ae9c1f..f956334b8 100644 --- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb +++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb @@ -27,7 +27,7 @@ "!pip install text-unidecode\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "# Install TorchAudio\n", @@ -58,7 +58,7 @@ "source": [ "In this tutorial, we shall first train these embeddings on speaker-related datasets, and then get speaker embeddings from a pretrained network for a new dataset. Since Google Colab has very slow read-write speeds, I'll be demonstrating this tutorial using [an4](http://www.speech.cs.cmu.edu/databases/an4/). \n", "\n", - "Instead, if you'd like to try on a bigger dataset like [hi-mia](https://arxiv.org/abs/1912.01231) use the [get_hi-mia-data.py](https://github.com/NVIDIA/NeMo/tree/main/scripts/dataset_processing/speaker_tasks/get_hi-mia_data.py) script to download the necessary files, extract them, and resample to 16Khz if any of these samples are not at 16Khz. " + "Instead, if you'd like to try on a bigger dataset like [hi-mia](https://arxiv.org/abs/1912.01231) use the [get_hi-mia-data.py](https://github.com/NVIDIA/NeMo/tree/r1.19.0/scripts/dataset_processing/speaker_tasks/get_hi-mia_data.py) script to download the necessary files, extract them, and resample to 16Khz if any of these samples are not at 16Khz. " ] }, { @@ -276,7 +276,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note: All the following steps are just for explanation of each section, but one can use the provided [training script](https://github.com/NVIDIA/NeMo/blob/main/examples/speaker_tasks/recognition/speaker_reco.py) to launch training in the command line." + "Note: All the following steps are just for explanation of each section, but one can use the provided [training script](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/speaker_tasks/recognition/speaker_reco.py) to launch training in the command line." 
] }, { @@ -760,7 +760,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note: You may use [finetune-script](https://github.com/NVIDIA/NeMo/blob/main/examples/speaker_tasks/recognition/speaker_reco_finetune.py) to launch training in the command line. Following is just a demonstration of the script" + "Note: You may use [finetune-script](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/speaker_tasks/recognition/speaker_reco_finetune.py) to launch training in the command line. Following is just a demonstration of the script" ] }, { diff --git a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb index 98f0cce4e..5f5641d1f 100644 --- a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb +++ b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb @@ -35,7 +35,7 @@ "id": "d4KCUoxSpdoZ" }, "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "\n", "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -126,7 +126,7 @@ "id": "S1DZk-inQGTI" }, "source": [ - "`TOOLS_DIR` contains scripts that we are going to need during the next steps, all necessary scripts could be found [here](https://github.com/NVIDIA/NeMo/tree/main/tools/ctc_segmentation/scripts)." + "`TOOLS_DIR` contains scripts that we are going to need during the next steps, all necessary scripts could be found [here](https://github.com/NVIDIA/NeMo/tree/r1.19.0/tools/ctc_segmentation/scripts)." ] }, { @@ -280,7 +280,7 @@ "* `max_length` argument - max number of words in a segment for alignment (used only if there are no punctuation marks present in the original text. Long non-speech segments are better for segments split and are more likely to co-occur with punctuation marks. Random text split could deteriorate the quality of the alignment.\n", "* out-of-vocabulary words will be removed based on pre-trained ASR model vocabulary, and the text will be changed to lowercase \n", "* sentences for alignment with the original punctuation and capitalization will be stored under `$OUTPUT_DIR/processed/*_with_punct.txt`\n", - "* numbers will be converted from written to their spoken form with `num2words` package. For English, it's recommended to use NeMo normalization tool use `--use_nemo_normalization` argument (not supported if running this segmentation tutorial in Colab, see the text normalization tutorial: [`https://github.com/NVIDIA/NeMo-text-processing/blob/main/tutorials/Text_(Inverse)_Normalization.ipynb`](https://colab.research.google.com/github/NVIDIA/NeMo-text-processing/blob/main/tutorials/Text_(Inverse)_Normalization.ipynb) for more details). Even `num2words` normalization is usually enough for proper segmentation. However, it does not take audio into account. NeMo supports audio-based normalization for English, German and Russian languages that can be applied to the segmented data as a post-processing step. Audio-based normalization produces multiple normalization options. For example, `901` could be normalized as `nine zero one` or `nine hundred and one`. The audio-based normalization chooses the best match among the possible normalization options and the transcript based on the character error rate. 
See [https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py](https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py) for more details.\n", + "* numbers will be converted from written to their spoken form with `num2words` package. For English, it's recommended to use NeMo normalization tool use `--use_nemo_normalization` argument (not supported if running this segmentation tutorial in Colab, see the text normalization tutorial: [`https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb`](https://colab.research.google.com/github/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb) for more details). Even `num2words` normalization is usually enough for proper segmentation. However, it does not take audio into account. NeMo supports audio-based normalization for English, German and Russian languages that can be applied to the segmented data as a post-processing step. Audio-based normalization produces multiple normalization options. For example, `901` could be normalized as `nine zero one` or `nine hundred and one`. The audio-based normalization chooses the best match among the possible normalization options and the transcript based on the character error rate. See [https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/nemo_text_processing/text_normalization/normalize_with_audio.py](https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/nemo_text_processing/text_normalization/normalize_with_audio.py) for more details.\n", "\n", "### Audio preprocessing:\n", "* non '.wav' audio files will be converted to `.wav` format\n", @@ -699,7 +699,7 @@ "source": [ "# Next Steps\n", "\n", - "- Check out [NeMo Speech Data Explorer tool](https://github.com/NVIDIA/NeMo/tree/main/tools/speech_data_explorer#speech-data-explorer) to interactively evaluate the aligned segments.\n", + "- Check out [NeMo Speech Data Explorer tool](https://github.com/NVIDIA/NeMo/tree/r1.19.0/tools/speech_data_explorer#speech-data-explorer) to interactively evaluate the aligned segments.\n", "- Try Audio-based normalization tool." ] }, diff --git a/tutorials/tools/Multispeaker_Simulator.ipynb b/tutorials/tools/Multispeaker_Simulator.ipynb index c2a9caf1e..8264854df 100644 --- a/tutorials/tools/Multispeaker_Simulator.ipynb +++ b/tutorials/tools/Multispeaker_Simulator.ipynb @@ -18,7 +18,7 @@ "\"\"\"\n", "\n", "NEMO_DIR_PATH = \"NeMo\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "\n", "! git clone https://github.com/NVIDIA/NeMo\n", "%cd NeMo\n", @@ -326,7 +326,7 @@ "outputs": [], "source": [ "if not os.path.exists(\"multispeaker_data_analysis.py\"):\n", - " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speaker_tasks/multispeaker_data_analysis.py\n", + " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/speaker_tasks/multispeaker_data_analysis.py\n", "\n", "from multispeaker_data_analysis import run_multispeaker_data_analysis\n", "\n", diff --git a/tutorials/tts/Aligner_Inference_Examples.ipynb b/tutorials/tts/Aligner_Inference_Examples.ipynb index 611e1e3b6..e113af556 100644 --- a/tutorials/tts/Aligner_Inference_Examples.ipynb +++ b/tutorials/tts/Aligner_Inference_Examples.ipynb @@ -39,7 +39,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode\n", @@ -700,7 +700,7 @@ "## Resources\n", "\n", "- For more information about the Aligner architecture, check out the [RAD-TTS Aligner paper](https://arxiv.org/abs/2108.10447).\n", - "- If you would like to run disambiguation on a large batch of sentences, try out the [Aligner disambiguation example script](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/aligner_heteronym_disambiguation.py)." + "- If you would like to run disambiguation on a large batch of sentences, try out the [Aligner disambiguation example script](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/tts/aligner_heteronym_disambiguation.py)." ] }, { diff --git a/tutorials/tts/Evaluation_MelCepstralDistortion.ipynb b/tutorials/tts/Evaluation_MelCepstralDistortion.ipynb index 699f1b131..0f501f89a 100644 --- a/tutorials/tts/Evaluation_MelCepstralDistortion.ipynb +++ b/tutorials/tts/Evaluation_MelCepstralDistortion.ipynb @@ -57,7 +57,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !pip install librosa numpy matplotlib" ] @@ -601,9 +601,9 @@ "source": [ "## Additional NeMo Resources\n", "\n", - "If you are unsure where to begin for training a TTS model, you may want to start with the [FastPitch and Mixer-TTS Training notebook](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) or the [NeMo TTS Primer notebook](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/NeMo_TTS_Primer.ipynb). For fine-tuning, there is also the [FastPitch Fine-Tuning notebook](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_Finetuning.ipynb).\n", + "If you are unsure where to begin for training a TTS model, you may want to start with the [FastPitch and Mixer-TTS Training notebook](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) or the [NeMo TTS Primer notebook](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/NeMo_TTS_Primer.ipynb). For fine-tuning, there is also the [FastPitch Fine-Tuning notebook](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/FastPitch_Finetuning.ipynb).\n", "\n", - "For some guidance on how to load a trained model and perform inference to generate mels or waveforms, check out how it's done in the [Inference notebook](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/Inference_ModelSelect.ipynb). Important functions to know are include `from_pretrained()` (if loading from an NGC model) and `restore_from()` (if loading a `.nemo` file). See the [NeMo Primer notebook](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/00_NeMo_Primer.ipynb) for more general information about model training, saving, and loading." + "For some guidance on how to load a trained model and perform inference to generate mels or waveforms, check out how it's done in the [Inference notebook](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/Inference_ModelSelect.ipynb). 
Important functions to know include `from_pretrained()` (if loading from an NGC model) and `restore_from()` (if loading a `.nemo` file). See the [NeMo Primer notebook](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/00_NeMo_Primer.ipynb) for more general information about model training, saving, and loading." ] } ], diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index fa1b1bdc9..95bc38050 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -59,7 +59,7 @@ "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", - "# BRANCH = 'main'\n", + "# BRANCH = 'r1.19.0'\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", @@ -246,7 +246,7 @@ "source": [ "### Extract Supplementary Data\n", "\n", - "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script." + "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script." ] }, { diff --git a/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb b/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb index 9c4ea4369..6685eca56 100644 --- a/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb @@ -61,7 +61,7 @@ "# !pip install wget text-unidecode matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# !python -m pip install \"git+https://github.com/NVIDIA/NeMo.git@${BRANCH}#egg=nemo_toolkit[all]\"\n", "\n", "## Install pynini\n", @@ -134,10 +134,10 @@ "\n", "FastPitch is non-autoregressive model for mel-spectrogram generation based on FastSpeech, conditioned on fundamental frequency contours. For more details about model, please refer to the original [paper](https://ieeexplore.ieee.org/abstract/document/9413889). Original [FastPitch model](https://ieeexplore.ieee.org/abstract/document/9413889) uses an external Tacotron 2 model trained on LJSpeech-1.1 to extract training alignments and estimate durations of input symbols.
This implementation of FastPitch is based on [Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/FastPitch), which uses an alignment mechanism proposed in [RAD-TTS](https://openreview.net/pdf?id=0NQwnnwAORi) and extended in [TTS Aligner](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9747707).\n", "\n", - "For more information on training a basic FastPitch model, please refer to [FastPitch_MixerTTS_Training.ipynb](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) tutorial.\n", + "For more information on training a basic FastPitch model, please refer to [FastPitch_MixerTTS_Training.ipynb](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) tutorial.\n", "\n", "### HiFi-GAN\n", - "HiFi-GAN is a generative adversarial network (GAN) model that generates audio from mel spectrograms. The generator uses transposed convolutions to upsample mel spectrograms to audio. For more details about the model, please refer to the original [paper](https://arxiv.org/abs/2010.05646). NeMo re-implementation of HiFi-GAN can be found [here](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/models/hifigan.py)." + "HiFi-GAN is a generative adversarial network (GAN) model that generates audio from mel spectrograms. The generator uses transposed convolutions to upsample mel spectrograms to audio. For more details about the model, please refer to the original [paper](https://arxiv.org/abs/2010.05646). NeMo re-implementation of HiFi-GAN can be found [here](https://github.com/NVIDIA/NeMo/blob/r1.19.0/nemo/collections/tts/models/hifigan.py)." ] }, { @@ -780,7 +780,7 @@ "- Finetuning with #1 has artifacts from the original audio (noise) that get passed on as input to the vocoder resulting in artifacts in vocoder output in the form of noise.\n", "- On the other hand, #2.1 (i.e. `Mel spectrogram predicted from FastPitch with groundtruth alignment and duration`) gives the best results because it enables HiFi-GAN to learn mel spectrograms generated by FastPitch as well as duration distributions closer to the real world (i.e. ground truth) durations. \n", "\n", - "From implementation perspective - we follow the same process described in [Finetuning FastPitch for a new speaker](FastPitch_Finetuning.ipynb) - i.e. take the latest checkpoint from FastPitch training and predict spectrograms for each of the input records in `train_manifest.json`, `test_manifest.json` and `val_manifest.json`. NeMo provides an efficient script, [scripts/dataset_processing/tts/generate_mels.py](https://raw.githubusercontent.com/nvidia/NeMo/main/scripts/dataset_processing/tts/generate_mels.py), to generate Mel-spectrograms in the directory `NeMoChineseTTS/mels` and also create new JSON manifests with a suffix `_mel` by adding a new key `\"mel_filepath\"`. For example, `train_manifest.json` corresponds to `train_manifest_mel.json` saved in the same directory. You can run the following CLI to obtain the new JSON manifests." + "From implementation perspective - we follow the same process described in [Finetuning FastPitch for a new speaker](FastPitch_Finetuning.ipynb) - i.e. take the latest checkpoint from FastPitch training and predict spectrograms for each of the input records in `train_manifest.json`, `test_manifest.json` and `val_manifest.json`. 
NeMo provides an efficient script, [scripts/dataset_processing/tts/generate_mels.py](https://raw.githubusercontent.com/nvidia/NeMo/r1.19.0/scripts/dataset_processing/tts/generate_mels.py), to generate Mel-spectrograms in the directory `NeMoChineseTTS/mels` and also create new JSON manifests with a suffix `_mel` by adding a new key `\"mel_filepath\"`. For example, `train_manifest.json` corresponds to `train_manifest_mel.json` saved in the same directory. You can run the following CLI to obtain the new JSON manifests." ] }, { diff --git a/tutorials/tts/FastPitch_Finetuning.ipynb b/tutorials/tts/FastPitch_Finetuning.ipynb index 794d4b71f..cf9486226 100755 --- a/tutorials/tts/FastPitch_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Finetuning.ipynb @@ -57,7 +57,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode \n", @@ -627,7 +627,7 @@ "id": "843674e7", "metadata": {}, "source": [ - "We can then finetune hifigan similarly to fastpitch using NeMo's [hifigan_finetune.py](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/hifigan_finetune.py) and [hifigan.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/conf/hifigan/hifigan.yaml):\n", + "We can then finetune hifigan similarly to fastpitch using NeMo's [hifigan_finetune.py](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/tts/hifigan_finetune.py) and [hifigan.yaml](https://github.com/NVIDIA/NeMo/blob/r1.19.0/examples/tts/conf/hifigan/hifigan.yaml):\n", "\n", "```bash\n", "python examples/tts/hifigan_finetune.py \\\n", diff --git a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb index e7cb0e896..7d1ce2658 100644 --- a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb @@ -61,7 +61,7 @@ "# !pip install wget text-unidecode matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# !python -m pip install \"git+https://github.com/NVIDIA/NeMo.git@${BRANCH}#egg=nemo_toolkit[all]\"\n", "\n", "## Install pynini\n", @@ -133,10 +133,10 @@ "\n", "FastPitch is non-autoregressive model for mel-spectrogram generation based on FastSpeech, conditioned on fundamental frequency contours. For more details about model, please refer to the original [paper](https://ieeexplore.ieee.org/abstract/document/9413889). Original [FastPitch model](https://ieeexplore.ieee.org/abstract/document/9413889) uses an external Tacotron 2 model trained on LJSpeech-1.1 to extract training alignments and estimate durations of input symbols. 
This implementation of FastPitch is based on [Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/FastPitch), which uses an alignment mechanism proposed in [RAD-TTS](https://openreview.net/pdf?id=0NQwnnwAORi) and extended in [TTS Aligner](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9747707).\n", "\n", - "For more information on training a basic FastPitch model, please refer to [FastPitch_MixerTTS_Training.ipynb](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) tutorial.\n", + "For more information on training a basic FastPitch model, please refer to [FastPitch_MixerTTS_Training.ipynb](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) tutorial.\n", "\n", "### HiFiGAN\n", - "HiFiGAN is a generative adversarial network (GAN) model that generates audio from mel spectrograms. The generator uses transposed convolutions to upsample mel spectrograms to audio. For more details about the model, please refer to the original [paper](https://arxiv.org/abs/2010.05646). NeMo re-implementation of HiFi-GAN can be found [here](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/models/hifigan.py)." + "HiFiGAN is a generative adversarial network (GAN) model that generates audio from mel spectrograms. The generator uses transposed convolutions to upsample mel spectrograms to audio. For more details about the model, please refer to the original [paper](https://arxiv.org/abs/2010.05646). NeMo re-implementation of HiFi-GAN can be found [here](https://github.com/NVIDIA/NeMo/blob/r1.19.0/nemo/collections/tts/models/hifigan.py)." ] }, { @@ -172,7 +172,7 @@ "3. `text`: original text;\n", "4. `normalized_text`: normalized text through our text normalization pipeline.\n", " \n", - "This script supports processing either of Thorsten's Neutral Datasets 21.02 or 22.10. In this tutorial, we only focus on the latest 22.10 version dataset. Please refer [thorsten-muller-s-german-neutral-tts-datasets](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tts/datasets.html#thorsten-muller-s-german-neutral-tts-datasets) for more details about Thorsten's datasets. \n", + "This script supports processing either of Thorsten's Neutral Datasets 21.02 or 22.10. In this tutorial, we only focus on the latest 22.10 version dataset. Please refer [thorsten-muller-s-german-neutral-tts-datasets](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/tts/datasets.html#thorsten-muller-s-german-neutral-tts-datasets) for more details about Thorsten's datasets. \n", "\n", "You can run the below command to obtain the final manifests, `train_manifest_text_normed.json`, `val_manifest_text_normed.json` and `test_manifest_text_normed.json`. **Note** that this script would take sometime (~2 hours) to download and normalize the entire dataset." ] @@ -649,7 +649,7 @@ "- Finetuning with #1 has artifacts from the original audio (noise) that get passed on as input to the vocoder resulting in artifacts in vocoder output in the form of noise.\n", "- On the other hand, #2.1 (i.e. `Mel spectrogram predicted from FastPitch with groundtruth alignment and duration`) gives the best results because it enables HiFi-GAN to learn mel spectrograms generated by FastPitch as well as duration distributions closer to the real world (i.e. ground truth) durations. 
\n", "\n", - "From implementation perspective - we follow the same process described in [Finetuning FastPitch for a new speaker](FastPitch_Finetuning.ipynb) - i.e. take the latest checkpoint from FastPitch training and predict spectrograms for each of the input records in `train_manifest_text_normed.json`, `test_manifest_text_normed.json` and `val_manifest_text_normed.json`. NeMo provides an efficient script, [scripts/dataset_processing/tts/generate_mels.py](https://raw.githubusercontent.com/nvidia/NeMo/main/scripts/dataset_processing/tts/generate_mels.py), to generate Mel-spectrograms in the directory `NeMoGermanTTS/mels` and also create new JSON manifests with a suffix `_mel` by adding a new key `\"mel_filepath\"`. For example, `train_manifest_text_normed.json` corresponds to `train_manifest_text_normed_mel.json` saved in the same directory. You can run the following CLI to obtain the new JSON manifests." + "From implementation perspective - we follow the same process described in [Finetuning FastPitch for a new speaker](FastPitch_Finetuning.ipynb) - i.e. take the latest checkpoint from FastPitch training and predict spectrograms for each of the input records in `train_manifest_text_normed.json`, `test_manifest_text_normed.json` and `val_manifest_text_normed.json`. NeMo provides an efficient script, [scripts/dataset_processing/tts/generate_mels.py](https://raw.githubusercontent.com/nvidia/NeMo/r1.19.0/scripts/dataset_processing/tts/generate_mels.py), to generate Mel-spectrograms in the directory `NeMoGermanTTS/mels` and also create new JSON manifests with a suffix `_mel` by adding a new key `\"mel_filepath\"`. For example, `train_manifest_text_normed.json` corresponds to `train_manifest_text_normed_mel.json` saved in the same directory. You can run the following CLI to obtain the new JSON manifests." ] }, { diff --git a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb index 558c0d95d..403faa965 100644 --- a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb @@ -50,7 +50,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode scipy==1.7.3\n", diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index defd0272d..a67744ef0 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -56,7 +56,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies# .\n", "\"\"\"\n", - "# BRANCH = 'main'\n", + "# BRANCH = 'r1.19.0'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", @@ -258,7 +258,7 @@ "source": [ "### Extract Supplementary Data\n", "\n", - "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script." + "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script." ] }, { diff --git a/tutorials/tts/FastPitch_Speaker_Interpolation.ipynb b/tutorials/tts/FastPitch_Speaker_Interpolation.ipynb index eda5bba0a..5a7f56dc2 100644 --- a/tutorials/tts/FastPitch_Speaker_Interpolation.ipynb +++ b/tutorials/tts/FastPitch_Speaker_Interpolation.ipynb @@ -94,7 +94,7 @@ "source": [ "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, diff --git a/tutorials/tts/Inference_DurationPitchControl.ipynb b/tutorials/tts/Inference_DurationPitchControl.ipynb index 73c12bc79..d4e1b1ba0 100644 --- a/tutorials/tts/Inference_DurationPitchControl.ipynb +++ b/tutorials/tts/Inference_DurationPitchControl.ipynb @@ -46,7 +46,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode\n", @@ -202,7 +202,7 @@ "\n", "Let's see the `pitch_predicted` for a sample text. You can run the below cell. You should get an image that looks like the following for the input `Hey, what is my pitch?`:\n", "\n", - "\n", + "\n", "\n", "Notice that the last word `pitch` has an increase in pitch to stress that it is a question." ] diff --git a/tutorials/tts/Inference_ModelSelect.ipynb b/tutorials/tts/Inference_ModelSelect.ipynb index 195b773fb..abdda3e16 100644 --- a/tutorials/tts/Inference_ModelSelect.ipynb +++ b/tutorials/tts/Inference_ModelSelect.ipynb @@ -46,7 +46,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode\n", diff --git a/tutorials/tts/NeMo_TTS_Primer.ipynb b/tutorials/tts/NeMo_TTS_Primer.ipynb index 99306744d..497552a9a 100644 --- a/tutorials/tts/NeMo_TTS_Primer.ipynb +++ b/tutorials/tts/NeMo_TTS_Primer.ipynb @@ -25,7 +25,7 @@ "source": [ "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, diff --git a/tutorials/tts/Pronunciation_customization.ipynb b/tutorials/tts/Pronunciation_customization.ipynb index 6fe269e76..6185610fe 100644 --- a/tutorials/tts/Pronunciation_customization.ipynb +++ b/tutorials/tts/Pronunciation_customization.ipynb @@ -26,7 +26,7 @@ "4. Run this cell to set up dependencies.\n", "\"\"\"\n", "\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode \n", @@ -128,7 +128,7 @@ "metadata": {}, "source": [ "#### Expected results if you run the tutorial:\n", - " \n", + " \n", "\n", "\n", "During preprocessing, unambiguous dictionary words are converted to phonemes, while OOV and words with multiple entries are kept as graphemes. For example, **paracetamol** is missing from the phoneme dictionary, and **can** has 2 forms." @@ -186,7 +186,7 @@ "metadata": {}, "source": [ "#### Expected results if you run the tutorial:\n", - " \n", + " \n", "\n", "\n", "## Dictionary customization\n", @@ -212,7 +212,7 @@ "if os.path.exists(ipa_cmu_dict):\n", " ! rm $ipa_cmu_dict\n", "\n", - "! wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/$ipa_cmu_dict\n", + "! wget https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/scripts/tts_dataset_files/$ipa_cmu_dict\n", "\n", "with open(ipa_cmu_dict, \"a\") as f:\n", " f.write(f\"PARACETAMOL {new_pronunciation}\\n\")\n", @@ -267,7 +267,7 @@ "metadata": {}, "source": [ "#### Expected results if you run the tutorial:\n", - " " + " " ] }, { @@ -276,7 +276,7 @@ "source": [ "# Resources\n", "* [TTS pipeline customization](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/tts/tts-custom.html#tts-pipeline-configuration)\n", - "* [Overview of TTS in NeMo](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/NeMo_TTS_Primer.ipynb)\n", + "* [Overview of TTS in NeMo](https://github.com/NVIDIA/NeMo/blob/r1.19.0/tutorials/tts/NeMo_TTS_Primer.ipynb)\n", "* [G2P models in NeMo](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tts/g2p.html)\n", "* [Riva TTS documentation](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/tts/tts-overview.html)" ] diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index e2ae5082e..a696ee26e 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -54,7 +54,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode\n", diff --git a/tutorials/tts/Vits_Training.ipynb b/tutorials/tts/Vits_Training.ipynb index 37e55e0d7..dbe4e9362 100644 --- a/tutorials/tts/Vits_Training.ipynb +++ b/tutorials/tts/Vits_Training.ipynb @@ -63,7 +63,7 @@ "# !pip install wget text-unidecode matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.19.0'\n", "# !python -m pip install \"git+https://github.com/NVIDIA/NeMo.git@${BRANCH}#egg=nemo_toolkit[all]\"\n", "\n", "## Install pynini\n", From 1dc8b37bba47f0534400414f1d3f972778b9c7a5 Mon Sep 17 00:00:00 2001 From: Li Tao Date: Fri, 19 May 2023 21:04:59 +0800 Subject: [PATCH 02/35] Fix a bug, use _ceil_to_nearest instead as _round_to_nearest is not defined (#6681) --- .../nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py index deb6e77cd..2c896c2e6 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py @@ -183,7 +183,7 @@ def collate_fn(self, batch): if self.pad_to_max_length: max_length = self.max_seq_length else: - max_length = min(self.max_seq_length, self._round_to_nearest(max_length, 8)) + max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 8)) assert max_length <= self.max_seq_length attention_mask = [self._create_attention_mask(max_length) for _ in batch] From 0ca1dd3685995267b49193236a6ca0d27aea75a6 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Tue, 23 May 2023 21:19:09 +0400 Subject: [PATCH 03/35] Fix k2 installation in Docker with CUDA 12 (#6707) Signed-off-by: Vladimir Bataev --- scripts/speech_recognition/k2/setup.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/speech_recognition/k2/setup.sh b/scripts/speech_recognition/k2/setup.sh index 64d9a3c12..48ca31dab 100755 --- a/scripts/speech_recognition/k2/setup.sh +++ b/scripts/speech_recognition/k2/setup.sh @@ -15,10 +15,12 @@ # limitations under the License. K2_REPO=https://github.com/k2-fsa/k2 -LATEST_RELEASE=$(git -c 'versionsort.suffix=-' \ - ls-remote --exit-code --refs --sort='version:refname' --tags ${K2_REPO} '*.*' \ - | tail --lines=1 \ - | cut -d '/' -f 3) +LATEST_RELEASE=e5671de # Temporary fix for CUDA 12 +# uncomment the following line after the next k2 version is released (>1.24.3) +#LATEST_RELEASE=$(git -c 'versionsort.suffix=-' \ +# ls-remote --exit-code --refs --sort='version:refname' --tags ${K2_REPO} '*.*' \ +# | tail --lines=1 \ +# | cut -d '/' -f 3) # "cut --delimiter '/' --fields 3" doesn't work on macOS, use "-d ... -f ..." 
instead K2_MAKE_ARGS="-j" pip install -v "git+${K2_REPO}@${LATEST_RELEASE}#egg=k2" || { echo "k2 could not be installed!"; exit 1; } From db6e29b83573e9d132ccd06192732571ddb9f349 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Wed, 24 May 2023 15:21:36 -0700 Subject: [PATCH 04/35] Tutorial fixes (#6717) Signed-off-by: smajumdar --- tutorials/00_NeMo_Primer.ipynb | 2 +- tutorials/AudioTranslationSample.ipynb | 2 +- tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb | 2 +- tutorials/asr/Offline_ASR.ipynb | 2 +- tutorials/nlp/MegatronBert_export.ipynb | 2 +- tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb | 4 ++-- tutorials/nlp/Punctuation_and_Capitalization.ipynb | 2 +- .../nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb | 2 +- tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb | 2 +- tutorials/tools/CTC_Segmentation_Tutorial.ipynb | 2 +- tutorials/tts/FastPitch_GermanTTS_Training.ipynb | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb index 193680f6d..2eff9c596 100644 --- a/tutorials/00_NeMo_Primer.ipynb +++ b/tutorials/00_NeMo_Primer.ipynb @@ -14,7 +14,7 @@ "\n", "The toolkit comes with extendable collections of pre-built modules and ready-to-use models for automatic speech recognition (ASR), natural language processing (NLP) and text synthesis (TTS). Built for speed, NeMo can utilize NVIDIA's Tensor Cores and scale out training to multiple GPUs and multiple nodes.\n", "\n", - "For more information, please visit https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/#" + "For more information, please visit https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/#" ] }, { diff --git a/tutorials/AudioTranslationSample.ipynb b/tutorials/AudioTranslationSample.ipynb index ac79ca3b2..b5c9d13a5 100644 --- a/tutorials/AudioTranslationSample.ipynb +++ b/tutorials/AudioTranslationSample.ipynb @@ -63,7 +63,7 @@ "import nemo\n", "# Import Speech Recognition collection\n", "import nemo.collections.asr as nemo_asr\n", - "# Import Natural Language Processing colleciton\n", + "# Import Natural Language Processing collection\n", "import nemo.collections.nlp as nemo_nlp\n", "# Import Speech Synthesis collection\n", "import nemo.collections.tts as nemo_tts\n", diff --git a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb index da2e53fd9..ede417d35 100644 --- a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb +++ b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb @@ -627,7 +627,7 @@ "\n", "\n", "Eg: \n", - "Since this model was trained on publically available speech datasets, the performance of this model might degrade for speech which includes technical terms, or vernacular that the model has not been trained on. The model might also perform worse for accented speech.\n", + "Since this model was trained on publicly available speech datasets, the performance of this model might degrade for speech which includes technical terms, or vernacular that the model has not been trained on. 
The model might also perform worse for accented speech.\n", "\n", "\n", "## References\n", diff --git a/tutorials/asr/Offline_ASR.ipynb b/tutorials/asr/Offline_ASR.ipynb index 685d3ef6f..2d963a6b7 100644 --- a/tutorials/asr/Offline_ASR.ipynb +++ b/tutorials/asr/Offline_ASR.ipynb @@ -30,7 +30,7 @@ "* use beam search decoder with N-gram language model re-scoring\n", "\n", "You may find more info on how to train and use language models for ASR models here:\n", - "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/asr/asr_language_modeling.html\n", + "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html\n", "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n" ] }, diff --git a/tutorials/nlp/MegatronBert_export.ipynb b/tutorials/nlp/MegatronBert_export.ipynb index 5bec75028..c435d6e76 100644 --- a/tutorials/nlp/MegatronBert_export.ipynb +++ b/tutorials/nlp/MegatronBert_export.ipynb @@ -64,7 +64,7 @@ "\n", "If you prefer to use the Huggingface BERT models, please skip this section and refer to `Setting up a NeMo Experiment` section to load a model from `nemo_nlp.modules.get_pretrained_lm_models_list()`\n", "\n", - "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." + "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." ] }, { diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb index 02d533e59..7ccf33826 100644 --- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb +++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb @@ -45,7 +45,7 @@ "\n", "- Our p-tuning implementation is based off Liu et al's paper [GPT Understands, Too](https://arxiv.org/abs/2103.10385).\n", "\n", - "- Command line usage examples and API documentation can be found in [our user docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/nlp/nemo_megatron/prompt_learning.html). \n", + "- Command line usage examples and API documentation can be found in [our user docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/prompt_learning.html). \n", "\n", "\"Prompt\n", "\n", @@ -88,7 +88,7 @@ "# The Best of Both\n", "A single pretrained GPT model can use both p-tuning and prompt-tuning. While you must decide to use either p-tuning or prompt-tuning for each task you want your model to perform, you can p-tune your model on a set of tasks A, then prompt tune your same model on a different set of tasks B, then finally run inference on tasks from both A and B at the same time. During prompt-tuning or p-tuning, tasks tuned at the same time must use the same number of virtual tokens. 
During inference, tasks using differing amounts of virtual tokens can be run at the same time.\n", "\n", - "Please see our [docs for more comparisons between prompt and p-tuning](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/nlp/nemo_megatron/prompt_learning.html). \n", + "Please see our [docs for more comparisons between prompt and p-tuning](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/prompt_learning.html). \n", "\n", "With all that covered, let's get started!\n" ] diff --git a/tutorials/nlp/Punctuation_and_Capitalization.ipynb b/tutorials/nlp/Punctuation_and_Capitalization.ipynb index 54e424c83..ea6dc45ef 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization.ipynb @@ -421,7 +421,7 @@ "\n", "- **trainer**: Any argument to be passed to PyTorch Lightning\n", "\n", - "See [docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/nlp/punctuation_and_capitalization.html#training-punctuation-and-capitalization-model) for full config description." + "See [docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#training-punctuation-and-capitalization-model) for full config description." ] }, { diff --git a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb index 3ce2ef9fb..62b3255d1 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb @@ -514,7 +514,7 @@ "\n", "- **trainer**: Any argument to be passed to PyTorch Lightning\n", "\n", - "See [docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/nlp/punctuation_and_capitalization.html#training-punctuation-and-capitalization-model) for full config description." + "See [docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#training-punctuation-and-capitalization-model) for full config description." ] }, { diff --git a/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb b/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb index 9ec4482f3..a1b0c4fd8 100644 --- a/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb +++ b/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb @@ -630,7 +630,7 @@ "source": [ "## Training Script\n", "\n", - "If you have NeMo installed locally, you can also train the model with [examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py](https://github.com/carolmanderson/NeMo/blob/r1.19.0/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py).\n", + "If you have NeMo installed locally, you can also train the model with [examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py](https://github.com/carolmanderson/NeMo/blob/main/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py).\n", "\n", "To run training script, use:\n", "\n", diff --git a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb index 5f5641d1f..5f1ffd27e 100644 --- a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb +++ b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb @@ -280,7 +280,7 @@ "* `max_length` argument - max number of words in a segment for alignment (used only if there are no punctuation marks present in the original text. 
Long non-speech segments are better for segments split and are more likely to co-occur with punctuation marks. Random text split could deteriorate the quality of the alignment.\n", "* out-of-vocabulary words will be removed based on pre-trained ASR model vocabulary, and the text will be changed to lowercase \n", "* sentences for alignment with the original punctuation and capitalization will be stored under `$OUTPUT_DIR/processed/*_with_punct.txt`\n", - "* numbers will be converted from written to their spoken form with `num2words` package. For English, it's recommended to use NeMo normalization tool use `--use_nemo_normalization` argument (not supported if running this segmentation tutorial in Colab, see the text normalization tutorial: [`https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb`](https://colab.research.google.com/github/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb) for more details). Even `num2words` normalization is usually enough for proper segmentation. However, it does not take audio into account. NeMo supports audio-based normalization for English, German and Russian languages that can be applied to the segmented data as a post-processing step. Audio-based normalization produces multiple normalization options. For example, `901` could be normalized as `nine zero one` or `nine hundred and one`. The audio-based normalization chooses the best match among the possible normalization options and the transcript based on the character error rate. See [https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/nemo_text_processing/text_normalization/normalize_with_audio.py](https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/nemo_text_processing/text_normalization/normalize_with_audio.py) for more details.\n", + "* numbers will be converted from written to their spoken form with `num2words` package. For English, it's recommended to use NeMo normalization tool use `--use_nemo_normalization` argument (not supported if running this segmentation tutorial in Colab, see the text normalization tutorial: [`https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb`](https://colab.research.google.com/github/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb) for more details). Even `num2words` normalization is usually enough for proper segmentation. However, it does not take audio into account. NeMo supports audio-based normalization for English, German and Russian languages that can be applied to the segmented data as a post-processing step. Audio-based normalization produces multiple normalization options. For example, `901` could be normalized as `nine zero one` or `nine hundred and one`. The audio-based normalization chooses the best match among the possible normalization options and the transcript based on the character error rate. 
See [https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py](https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/nemo_text_processing/text_normalization/normalize_with_audio.py) for more details.\n", "\n", "### Audio preprocessing:\n", "* non '.wav' audio files will be converted to `.wav` format\n", diff --git a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb index 7d1ce2658..512ec8249 100644 --- a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb @@ -172,7 +172,7 @@ "3. `text`: original text;\n", "4. `normalized_text`: normalized text through our text normalization pipeline.\n", " \n", - "This script supports processing either of Thorsten's Neutral Datasets 21.02 or 22.10. In this tutorial, we only focus on the latest 22.10 version dataset. Please refer [thorsten-muller-s-german-neutral-tts-datasets](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/r1.19.0/tts/datasets.html#thorsten-muller-s-german-neutral-tts-datasets) for more details about Thorsten's datasets. \n", + "This script supports processing either of Thorsten's Neutral Datasets 21.02 or 22.10. In this tutorial, we only focus on the latest 22.10 version dataset. Please refer [thorsten-muller-s-german-neutral-tts-datasets](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tts/datasets.html#thorsten-muller-s-german-neutral-tts-datasets) for more details about Thorsten's datasets. \n", "\n", "You can run the below command to obtain the final manifests, `train_manifest_text_normed.json`, `val_manifest_text_normed.json` and `test_manifest_text_normed.json`. **Note** that this script would take sometime (~2 hours) to download and normalize the entire dataset." 
] From 2e2df4aed113156803a1d580d97c849118f94310 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Fri, 26 May 2023 09:13:49 -0700 Subject: [PATCH 05/35] VP Fixes for converter + Config management (#6698) (#6738) * [Temp] VP Fixes Signed-off-by: smajumdar * Revert logging Signed-off-by: smajumdar --------- Signed-off-by: smajumdar (cherry picked from commit b6f46a0f36659024bae04f24323a16aa8b09f45a) --- .../megatron_change_num_partitions.py | 99 ++++++++++++++++--- 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index 558986e3d..2938a1609 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -56,7 +56,7 @@ --target_pipeline_model_parallel_size=1 \ --target_pipeline_model_parallel_split_rank=0 \ --precision=bf16 - + # Megatron GPT + Virtual Pipeline parallelism python megatron_change_num_partitions.py \ @@ -138,17 +138,34 @@ def set_virtual_parallel_rank_safely(rank: int): def force_cpu_model(cfg): with open_dict(cfg): - # temporarily + # temporarily set to cpu original_cpu_init = cfg.get('use_cpu_initialization', False) - original_amp_o2 = cfg.get('megatron_amp_O2', False) + if 'megatron_amp_O2' in cfg: + key = 'megatron_amp_O2' + original_amp_o2 = cfg.megatron_amp_O2 + elif 'megatron_amp_02' in cfg: + key = 'megatron_amp_02' + original_amp_o2 = cfg.megatron_amp_02 + else: + key, original_amp_o2 = None, None + + # Set new values cfg.use_cpu_initialization = True - cfg.megatron_amp_O2 = False - return cfg, {'original_cpu_init': original_cpu_init, 'original_amp_o2': original_amp_o2} + if key is not None: + cfg[key] = False + + # Setup restore dict + restore_dict = {'use_cpu_initialization': original_cpu_init} # 'megatron_amp_O2': original_amp_o2 + if key is not None: + restore_dict[key] = original_amp_o2 + + return cfg, restore_dict def restore_model_config(cfg, original_dict): with open_dict(cfg): for key, val in original_dict.items(): + logging.info(f"Restoring model config key ({key}) from {cfg[key]} to original value of {val}") cfg[key] = val return cfg @@ -1034,6 +1051,8 @@ def main(): os.path.join(model_filepath, args.ckpt_name) ) + vp_state_dict = torch.load(checkpoint_path, map_location="cpu") + if hparams_filepath is not None: # Force the model onto CPU tmp_cfg = OmegaConf.load(hparams_filepath) @@ -1078,9 +1097,10 @@ def main(): vp_params_tmp = [] for vp_idx in range(vp_size): set_virtual_parallel_rank_safely(vp_idx) - params = [p for p in model.model[vp_idx].parameters()] - # params = model.model[vp_idx].module.state_dict_for_save_checkpoint() - # params = [p for p in params.values()] + vp_params = vp_state_dict[f'model{vp_idx}'] + model.model[vp_idx].module.load_state_dict(vp_params, strict=True) + model.model[vp_idx].module.to('cpu') + params = [p for p in model.model[vp_idx].module.parameters()] vp_params_tmp.append(params) # partitions[pp_rank][vp_idx].append(params) @@ -1141,6 +1161,8 @@ def main(): model = model.to('cpu') model._save_restore_connector = NLPSaveRestoreConnector() + restore_model_config(model.cfg, restore_dict) + vp_param_count = 0 for vp in range(vp_size): for pp in range(pp_size): @@ -1159,15 +1181,62 @@ def main(): else: flat_partitions = {idx: [] for idx in range(pp_size)} - for pp in range(pp_size): - for tp in range(tp_size): - vp_cache = [] - for vp in range(vp_size): - vp_cache.extend(partitions[vp][pp][tp]) 
+ """ + Under VP convention + Notation : + Stage = PP rank + Number = GPT model / layer index + Ignore TP - every PP has all TP corresponding to that PP + chunk_index = the physical index of any [] in the list. Ex idx = 2 in below map corresponds to [2: PP 0 VP 1]] + + + For a PP 2 VP 4 model with 8 GPT layers- - flat_partitions[pp].append(vp_cache) + Indices + # Stage 0: [0:PP 0 VP 0] [2:PP 0 VP 1] [4:PP 0 VP 2] [6:PP 0 VP 3] + # Stage 1: [1:PP 1 VP 0] [3:PP 1 VP 1] [5:PP 1 VP 2] [7:PP 1 VP 3] + + after conversion will become + + # Stage 0: [0,1,2,3:PP 0] + # Stage 1: [4,5,6,7:PP 1] + + """ + pp_index = 0 + chunk_counter = 0 + tp_cache = [[] for _ in range(tp_size)] + + for vp in range(vp_size): + for pp in range(pp_size): + # Gather all TP under this VP PP combination. + # We will accumulate TP parameters from multiple layers in this cache. + for tp in range(tp_size): + tp_cache[tp].extend(partitions[vp][pp][tp]) + + # This counter indexes the global selection of a VP PP combination in the above map + chunk_counter += 1 + + # Log the mapping from old VP x PP to new PP index + logging.info(f"VP Conversion - vp: {vp} pp: {pp} -> pp_idx: {pp_index}") + + # Every vp_size chunks, we can fill a new PP index in the flat_partitions + if chunk_counter % vp_size == 0: + flat_partitions[pp_index].extend(tp_cache) + tp_cache = [[] for _ in range(tp_size)] + pp_index += 1 + + logging.debug( + f"VP merge step: \n" + f"vp: {vp} pp: {pp} pp_idx: {pp_index - 1} " + f"len(flat_partitions): {len(flat_partitions[pp_index - 1])}" + ) + + logging.debug(f"PP Size len(flat partitions) : {len(flat_partitions)}") + logging.debug(f"TP Size len(flat partitions[0]): {len(flat_partitions[0])}") + logging.debug(f"Layers len(flat partitions[0][0]) : {len(flat_partitions[0][0])}") partitions = flat_partitions + del tp_cache if tgt_tp_size > 1 or tgt_pp_size > 1: merge_partition(model, partitions) @@ -1175,8 +1244,6 @@ def main(): # Write out the PP 1 TP 1 model to disk merge_partition(model, partitions, args.target_file) - restore_model_config(model.cfg, restore_dict) - # Empty cache memory of all parameters from all PP TP partitions partitions.clear() From 4df8f33a3bb5de4a6e39f4b9ad31dd7d35739783 Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Date: Sat, 27 May 2023 06:29:54 +0800 Subject: [PATCH 06/35] Fix fastpitch test nightly (#6742) Signed-off-by: hsiehjackson --- .../collections/tts/models/test_fastpitch.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/collections/tts/models/test_fastpitch.py b/tests/collections/tts/models/test_fastpitch.py index c77d70cbc..2502b1f7c 100644 --- a/tests/collections/tts/models/test_fastpitch.py +++ b/tests/collections/tts/models/test_fastpitch.py @@ -16,7 +16,10 @@ This file implemented unit tests for loading all pretrained FastPitch NGC checkpoints and generating Mel-spectrograms. The test duration breakdowns are shown below. In general, each test for a single model is ~25 seconds on an NVIDIA RTX A6000. 
""" +import random + import pytest +import torch from nemo.collections.tts.models import FastPitchModel @@ -38,4 +41,23 @@ def test_inference(pretrained_model, language_specific_text_example): model, language_id = pretrained_model text = language_specific_text_example[language_id] parsed_text = model.parse(text) - _ = model.generate_spectrogram(tokens=parsed_text) + + # Multi-Speaker + speaker_id = None + reference_spec = None + reference_spec_lens = None + + if hasattr(model.fastpitch, 'speaker_emb'): + speaker_id = 0 + + if hasattr(model.fastpitch, 'speaker_encoder'): + if hasattr(model.fastpitch.speaker_encoder, 'lookup_module'): + speaker_id = 0 + if hasattr(model.fastpitch.speaker_encoder, 'gst_module'): + bs, lens, t_spec = parsed_text.shape[0], random.randint(50, 100), model.cfg.n_mel_channels + reference_spec = torch.rand(bs, lens, t_spec) + reference_spec_lens = torch.tensor([lens]).long().expand(bs) + + _ = model.generate_spectrogram( + tokens=parsed_text, speaker=speaker_id, reference_spec=reference_spec, reference_spec_lens=reference_spec_lens + ) From e806e1166684fcbac732fc0f664b4654b9b2e748 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Fri, 26 May 2023 16:41:46 -0600 Subject: [PATCH 07/35] check for first or last stage (#6708) * check for first or last stage Signed-off-by: ericharper * remove redundant check Signed-off-by: ericharper * fix typo Signed-off-by: ericharper * add map_location Signed-off-by: ericharper --------- Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_eval.py | 1 + .../modules/common/text_generation_utils.py | 65 ++++++++++--------- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index 0ac155374..14cdbf8a7 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -203,6 +203,7 @@ def main(cfg) -> None: trainer=trainer, override_config_path=pretrained_cfg, save_restore_connector=save_restore_connector, + map_location=f'cuda:{trainer.local_rank}', # map_location is needed for converted models ) elif cfg.checkpoint_dir: app_state = AppState() diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index b39ac406d..8cfb02c5e 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -135,36 +135,41 @@ def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para def get_computeprob_response(tokenizer, response, inputs): - compute_prob_response = {} - new_token_ids = [] - new_tokens = [] - new_texts = [] - log_probs = [] - full_logprobs = [] - offsets = [] - for batch_id in range(len(response['tokens'])): - if isinstance(inputs, (list, tuple)): - if isinstance(inputs[0], str): - new_token_id = tokenizer.text_to_ids(inputs[batch_id]) - new_text = inputs[batch_id] - token_len = len(new_token_id) - elif isinstance(inputs[0], torch.Tensor): - token_len = int(inputs[1][batch_id].item()) - new_token_id = inputs[0][batch_id][:token_len].tolist() - new_text = tokenizer.ids_to_text(new_token_id) - new_token_ids.append(new_token_id) - new_tokens.append(response['tokens'][batch_id][:token_len]) - new_texts.append(new_text) - log_probs.append(response['logprob'][batch_id][:token_len]) - full_logprobs.append(response['full_logprob'][batch_id][:token_len]) - 
offsets.append(response['offsets'][batch_id][:-1]) - compute_prob_response['sentences'] = new_texts - compute_prob_response['tokens'] = new_tokens - compute_prob_response['token_ids'] = new_token_ids - compute_prob_response['logprob'] = log_probs - compute_prob_response['full_logprob'] = full_logprobs - compute_prob_response['offsets'] = offsets - return compute_prob_response + if parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage(): + # we only have a response on the first and last pipeline stages + compute_prob_response = {} + new_token_ids = [] + new_tokens = [] + new_texts = [] + log_probs = [] + full_logprobs = [] + offsets = [] + for batch_id in range(len(response['tokens'])): + if isinstance(inputs, (list, tuple)): + if isinstance(inputs[0], str): + new_token_id = tokenizer.text_to_ids(inputs[batch_id]) + new_text = inputs[batch_id] + token_len = len(new_token_id) + elif isinstance(inputs[0], torch.Tensor): + token_len = int(inputs[1][batch_id].item()) + new_token_id = inputs[0][batch_id][:token_len].tolist() + new_text = tokenizer.ids_to_text(new_token_id) + new_token_ids.append(new_token_id) + new_tokens.append(response['tokens'][batch_id][:token_len]) + new_texts.append(new_text) + log_probs.append(response['logprob'][batch_id][:token_len]) + full_logprobs.append(response['full_logprob'][batch_id][:token_len]) + offsets.append(response['offsets'][batch_id][:-1]) + compute_prob_response['sentences'] = new_texts + compute_prob_response['tokens'] = new_tokens + compute_prob_response['token_ids'] = new_token_ids + compute_prob_response['logprob'] = log_probs + compute_prob_response['full_logprob'] = full_logprobs + compute_prob_response['offsets'] = offsets + return compute_prob_response + else: + # intermediate stages + return None def get_batch(model, tokenizer, context_tokens): From dbd6a565992842bc8a04714c154b3e124da4c049 Mon Sep 17 00:00:00 2001 From: Markel Sanz Ausin Date: Mon, 29 May 2023 16:01:52 -0700 Subject: [PATCH 08/35] Bug fix to restore act ckpt (#6753) * Bug fix to restore act ckpt Signed-off-by: Markel Sanz Ausin * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Markel Sanz Ausin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../language_modeling/megatron_gpt_model.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e9545361b..809825752 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1143,16 +1143,20 @@ def _restore_activation_checkpointing_args(self): _reset_activation_checkpointing_args. """ # Restore config values. 
- self.cfg.activations_checkpoint_granularity = self.last_checkpointing_granularity - self.cfg.activations_checkpoint_method = self.last_checkpointing_method - self.cfg.activations_checkpoint_num_layers = self.last_checkpointing_num_layers + self.cfg.activations_checkpoint_granularity = self.last_activations_checkpoint_granularity + self.cfg.activations_checkpoint_method = self.last_activations_checkpoint_method + self.cfg.activations_checkpoint_num_layers = self.last_activations_checkpoint_num_layers self.cfg.activations_checkpoint_layers_per_pipeline = self.last_activations_checkpoint_layers_per_pipeline # Restore model parameters. for module in self.get_gpt_module_list(): - module.language_model.encoder.activations_checkpoint_granularity = self.last_checkpointing_granularity - module.language_model.encoder.activations_checkpoint_method = self.last_checkpointing_method - module.language_model.encoder.activations_checkpoint_num_layers = self.last_checkpointing_num_layers + module.language_model.encoder.activations_checkpoint_granularity = ( + self.last_activations_checkpoint_granularity + ) + module.language_model.encoder.activations_checkpoint_method = self.last_activations_checkpoint_method + module.language_model.encoder.activations_checkpoint_num_layers = ( + self.last_activations_checkpoint_num_layers + ) module.language_model.encoder.activations_checkpoint_layers_per_pipeline = ( self.last_activations_checkpoint_layers_per_pipeline ) From a0f757e257ad91fb842024d7f6a2d5a189338626 Mon Sep 17 00:00:00 2001 From: Markel Sanz Ausin Date: Wed, 31 May 2023 10:22:23 -0700 Subject: [PATCH 09/35] Bug fix to reset sequence parallelism (#6756) * Bug fix to reset sequence parallelism Signed-off-by: Markel Sanz Ausin * Update seq par reset/restore Signed-off-by: Markel Sanz Ausin * Add nested loop Signed-off-by: Markel Sanz Ausin --------- Signed-off-by: Markel Sanz Ausin --- .../models/language_modeling/megatron_gpt_model.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 809825752..66fa0ed27 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1170,12 +1170,13 @@ def _reset_sequence_parallelism_args(self): self.last_sequence_parallel = self.cfg.sequence_parallel # Reset config values. Needed for calling generate. - self.cfg.sequence_parallel = None + self.cfg.sequence_parallel = False # Reset model parameters. - for module in self.get_gpt_module_list(): - module.language_model.encoder.sequence_parallel = None + for mod in module.modules(): + if hasattr(mod, "sequence_parallel"): + mod.sequence_parallel = self.last_sequence_parallel def _restore_sequence_parallelism_args(self): """ Restores the sequence parallelism parameters using the values saved by @@ -1187,4 +1188,6 @@ def _restore_sequence_parallelism_args(self): # Restore model parameters. 
for module in self.get_gpt_module_list(): - module.language_model.encoder.sequence_parallel = self.last_sequence_parallel + for mod in module.modules(): + if hasattr(mod, "sequence_parallel"): + mod.sequence_parallel = self.last_sequence_parallel From 39dd654c6b37e42c35e14d2994caa1ed92c11c43 Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Wed, 31 May 2023 10:46:19 -0700 Subject: [PATCH 10/35] Fix checkpointed forward and add test for full activation checkpointing (#6744) * fix checkpointed forward and add test for full activation checkpointing Signed-off-by: Abhinav Khattar * add method Signed-off-by: Abhinav Khattar * add method Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar --- Jenkinsfile | 2 ++ nemo/collections/nlp/modules/common/megatron/transformer.py | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 27fbf1114..780e3e4b4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3175,6 +3175,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" @@ -3211,6 +3212,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 0f6112e08..9a09a9f9a 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -1268,9 +1268,6 @@ def custom_forward(*inputs): return custom_forward - # Make sure memory is freed. - tensor_parallel.reset_checkpointed_activations_memory_buffer() - if self.activations_checkpoint_method == 'uniform': # Uniformly divide the total number of Transformer layers and checkpoint # the input activation of each divided chunk. From 216bcabbab57f46f9f2b8cc4caba855fdf5da532 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Wed, 31 May 2023 15:54:55 -0700 Subject: [PATCH 11/35] Fix Links (#6777) Signed-off-by: smajumdar --- tutorials/tools/CTC_Segmentation_Tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb index 5f1ffd27e..4d64acedb 100644 --- a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb +++ b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb @@ -280,7 +280,7 @@ "* `max_length` argument - max number of words in a segment for alignment (used only if there are no punctuation marks present in the original text. 
Long non-speech segments are better for segments split and are more likely to co-occur with punctuation marks. Random text split could deteriorate the quality of the alignment.\n", "* out-of-vocabulary words will be removed based on pre-trained ASR model vocabulary, and the text will be changed to lowercase \n", "* sentences for alignment with the original punctuation and capitalization will be stored under `$OUTPUT_DIR/processed/*_with_punct.txt`\n", - "* numbers will be converted from written to their spoken form with `num2words` package. For English, it's recommended to use NeMo normalization tool use `--use_nemo_normalization` argument (not supported if running this segmentation tutorial in Colab, see the text normalization tutorial: [`https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb`](https://colab.research.google.com/github/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb) for more details). Even `num2words` normalization is usually enough for proper segmentation. However, it does not take audio into account. NeMo supports audio-based normalization for English, German and Russian languages that can be applied to the segmented data as a post-processing step. Audio-based normalization produces multiple normalization options. For example, `901` could be normalized as `nine zero one` or `nine hundred and one`. The audio-based normalization chooses the best match among the possible normalization options and the transcript based on the character error rate. See [https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py](https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/nemo_text_processing/text_normalization/normalize_with_audio.py) for more details.\n", + "* numbers will be converted from written to their spoken form with `num2words` package. For English, it's recommended to use NeMo normalization tool use `--use_nemo_normalization` argument (not supported if running this segmentation tutorial in Colab, see the text normalization tutorial: [`https://github.com/NVIDIA/NeMo-text-processing/blob/main/tutorials/Text_(Inverse)_Normalization.ipynb`](https://colab.research.google.com/github/NVIDIA/NeMo-text-processing/blob/main/tutorials/Text_(Inverse)_Normalization.ipynb) for more details). Even `num2words` normalization is usually enough for proper segmentation. However, it does not take audio into account. NeMo supports audio-based normalization for English, German and Russian languages that can be applied to the segmented data as a post-processing step. Audio-based normalization produces multiple normalization options. For example, `901` could be normalized as `nine zero one` or `nine hundred and one`. The audio-based normalization chooses the best match among the possible normalization options and the transcript based on the character error rate. 
See [https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py](https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py) for more details.\n", "\n", "### Audio preprocessing:\n", "* non '.wav' audio files will be converted to `.wav` format\n", From 4ecc769381d0a35f9249c02ccf26a3d8e72f98ca Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Thu, 1 Jun 2023 09:42:28 -0700 Subject: [PATCH 12/35] add call to p2p overlap (#6779) * add call to p2p overlap Signed-off-by: Abhinav Khattar * update Jenkins for test Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar --- Jenkinsfile | 9 +++++++++ .../nlp/models/language_modeling/megatron_gpt_model.py | 2 ++ 2 files changed, 11 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 780e3e4b4..7d0b8ee28 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -57,6 +57,15 @@ pipeline { } } + stage('Megatron Core installation') { + steps { + sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ + cd Megatron-LM && \ + git checkout e6d7e09845590d0a36bc7f29eb28db974fb8da4e && \ + pip install -e .' + } + } + stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 66fa0ed27..7b67f1602 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -371,6 +371,8 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): no_sync_func=no_sync_func, grad_sync_func=grad_sync_func, param_sync_func=param_sync_func, + overlap_p2p_comm=self.cfg.get('overlap_p2p_comm', False), + batch_p2p_comm=self.cfg.get('batch_p2p_comm', True), ) # only the last stages of the pipeline return losses From 1486b1239aa652bb9906f6d19c63a5a532621214 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Thu, 1 Jun 2023 11:05:55 -0600 Subject: [PATCH 13/35] Fix get_parameters when using main params optimizer (#6764) * fix get param Signed-off-by: ericharper * change name Signed-off-by: ericharper --------- Signed-off-by: ericharper --- .../models/language_modeling/megatron_base_model.py | 12 +++++++----- nemo/core/optim/optimizer_with_main_params.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 1237491fa..2aaedbe5a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -240,14 +240,16 @@ def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by ) return after - def _get_parameters(self): + def get_parameters_with_grad(self): """ - private method to load all the trainable parameters from optimizer param groups + Get all parameters with grad from optimizer param groups """ params = [] for param_group in self._optimizer_param_groups: for param in param_group['params']: - if param.requires_grad: # (@adithyare) adapter training with pp>1 can result in params with no grads + if ( + param.grad is not None + ): # (@adithyare) adapter training with pp>1 can result in params with no grads params.append(param) return params @@ -272,9 +274,9 
@@ def configure_gradient_clipping(self, *args, **kwargs): else: if self.megatron_amp_o2: # grep fp32 master parameters for gradient clipping - parameters = self._optimizer.get_parameters() + parameters = self._optimizer.get_parameters_with_grad() else: - parameters = self._get_parameters() + parameters = self.get_parameters_with_grad() grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val) self.log('grad_norm', grad_norm, rank_zero_only=True, batch_size=1) diff --git a/nemo/core/optim/optimizer_with_main_params.py b/nemo/core/optim/optimizer_with_main_params.py index c9790ee2a..44d54a0e6 100644 --- a/nemo/core/optim/optimizer_with_main_params.py +++ b/nemo/core/optim/optimizer_with_main_params.py @@ -488,11 +488,11 @@ def async_master_grads_allreudce(self): def fp32_grad_accumulation(self): return self._fp32_grad_accum - def get_parameters(self): + def get_parameters_with_grad(self): params = [] for param_group in self.optimizer.param_groups: for param in param_group['params']: - if param.requires_grad: # (@adithyare) added to enable pp>1 training for adapters + if param.grad is not None: # (@adithyare) added to enable pp>1 training for adapters params.append(param) return params From aff5217f2149bee31355ee85ecdd0db14ce27eea Mon Sep 17 00:00:00 2001 From: wdykas <73254672+wdykas@users.noreply.github.com> Date: Thu, 1 Jun 2023 14:25:27 -0400 Subject: [PATCH 14/35] Lddl bert (#6761) * initial POC for LDDL Bert * Finish LDDL POC * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * address comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix merge head * resolving merge * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for val/test loaders * change to new LDDL class + add winding * fix logging level * fix winding * test fix * fixes to winding * add file system * add prepemption optimizations * more logging * more prints * better logging * asfsf * add barrier * removing prints * working with mb lddl loader * final changes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update requirements file with LDDL Signed-off-by: wdykas * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert adding to requirements --------- Signed-off-by: wdykas Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../conf/megatron_bert_config.yaml | 2 +- .../megatron_bert_pretraining.py | 5 +- .../language_modeling/megatron_bert_model.py | 129 +++++++++++++++++- 3 files changed, 126 insertions(+), 10 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index cbc0562e2..a7e3364d4 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -133,7 +133,7 @@ model: seq_length: ${model.encoder_seq_length} skip_warmup: True num_workers: 0 - dataloader_type: single # cyclic + dataloader_type: single # cyclic, LDDL reset_position_ids: False # Reset position ids after end-of-document token reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens diff --git 
a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index e6abee295..5f0b74db9 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -29,11 +29,12 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="megatron_bert_config") def main(cfg) -> None: + if cfg.model.data.dataloader_type != "LDDL": + mp.set_start_method("spawn", force=True) + logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 64430a669..cac1a50e9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -40,6 +40,7 @@ from nemo.core.neural_types import ChannelType, MaskType, NeuralType from nemo.utils import AppState, logging + try: from apex.transformer.pipeline_parallel.utils import get_num_microbatches @@ -49,6 +50,14 @@ HAVE_APEX = False +try: + import logging + from lddl.torch_mp import get_bert_pretrain_data_loader + + HAVE_LDDL = True +except (ImportError, ModuleNotFoundError): + HAVE_LDDL = False + try: from megatron.core import parallel_state from megatron.core.pipeline_parallel.schedules import get_forward_backward_func @@ -300,7 +309,12 @@ def training_step(self, dataloader_iter, batch_idx): for param in module.embedding.parameters(): param.data_ptr() - tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + if self.cfg.data.dataloader_type == "LDDL": + # this is of type bert dataset + seq_length = dataloader_iter.iterator.loaders.get_seqlen() + tensor_shape = [seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + else: + tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] # run forward and backwards passes for an entire global batch # we do this inside training_step to support pipeline parallelism @@ -324,7 +338,10 @@ def training_step(self, dataloader_iter, batch_idx): loss_tensor = torch.vstack(loss_tensors_list) loss_mean = loss_tensor.mean(axis=0) else: - loss_mean = torch.tensor([0.0, 0.0]).cuda() + if self.cfg.bert_binary_head == True: + loss_mean = torch.tensor([0.0, 0.0, 0.0]).cuda() + else: + loss_mean = torch.tensor([0.0, 0.0]).cuda() # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): @@ -404,7 +421,12 @@ def allreduce_first_last_embeddings(self): torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) def validation_step(self, dataloader_iter, batch_idx): - tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + + if self.cfg.data.dataloader_type == "LDDL": + seq_length = dataloader_iter.iterator.get_seqlen() + tensor_shape = [seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + else: + tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] fwd_bwd_function = get_forward_backward_func() @@ -476,6 +498,95 @@ def loss_func(self, loss_mask, sentence_order, output_tensor): # 
[lm_loss]) # return loss, {'lm loss': averaged_losses[0]} + def build_LDDL_data(self, cfg): + if not HAVE_LDDL: + raise ImportError( + "LDDL was not found. Please see the LDDL README for installation instructions: https://github.com/NVIDIA/LDDL#installation." + ) + logging.info(f'Starting building LDDL Dataloaders') + self._train_ds = None + self._validation_ds = None + self._test_ds = None + data_parallel_size = parallel_state.get_data_parallel_world_size() + num_micro_batches = self.cfg.global_batch_size // (self.cfg.micro_batch_size * data_parallel_size) + global_batch_size_on_this_data_parallel_rank = num_micro_batches * self.cfg.micro_batch_size + samples_consumed_dploader = self.compute_consumed_samples(0) // data_parallel_size + # We run under the assumption that the datapath is the prefix if LDDL dataloader + train_lddl_data_path = self.cfg.data.data_prefix[0] + self._train_dl = get_bert_pretrain_data_loader( + train_lddl_data_path, + dp_rank=parallel_state.get_data_parallel_rank(), + local_rank=self.local_rank, + shuffle_buffer_size=16384, + shuffle_buffer_warmup_factor=16, + vocab_file=self.cfg.tokenizer.vocab_file, + data_loader_kwargs={ + 'batch_size': global_batch_size_on_this_data_parallel_rank, + 'num_workers': self.cfg.data.num_workers, + 'prefetch_factor': 2, + }, + mlm_probability=0.15, + base_seed=self.cfg.seed, + log_level=logging.CRITICAL, + log_dir="/tmp/log", + return_raw_samples=False, + start_epoch=0, + sequence_length_alignment=8, + ignore_index=-1, + samples_seen=samples_consumed_dploader, + micro_batch_size=self.cfg.micro_batch_size, + ) + logging.info(f'Completed build train LDDL Dataloader') + if len(self.cfg.data.data_prefix) > 1: + val_lddl_data_path = self.cfg.data.data_prefix[1] + self._validation_dl = get_bert_pretrain_data_loader( + val_lddl_data_path, + dp_rank=parallel_state.get_data_parallel_rank(), + local_rank=self.local_rank, + shuffle_buffer_size=16384, + shuffle_buffer_warmup_factor=16, + vocab_file=self.cfg.tokenizer.vocab_file, + data_loader_kwargs={ + 'batch_size': global_batch_size_on_this_data_parallel_rank, + 'num_workers': self.cfg.data.num_workers, + 'prefetch_factor': 2, + }, + mlm_probability=0.15, + base_seed=self.cfg.seed, + log_level=logging.CRITICAL, + log_dir="/tmp/log", + return_raw_samples=False, + start_epoch=0, + sequence_length_alignment=8, + ignore_index=-1, + micro_batch_size=self.cfg.micro_batch_size, + ) + if len(self.cfg.data.data_prefix) > 2: + test_lddl_data_path = self.cfg.data.data_prefix[2] + self._test_dl = get_bert_pretrain_data_loader( + test_lddl_data_path, + dp_rank=parallel_state.get_data_parallel_rank(), + local_rank=self.local_rank, + shuffle_buffer_size=16384, + shuffle_buffer_warmup_factor=16, + vocab_file=self.cfg.tokenizer.vocab_file, + data_loader_kwargs={ + 'batch_size': global_batch_size_on_this_data_parallel_rank, + 'num_workers': self.cfg.data.num_workers, + 'prefetch_factor': 2, + }, + mlm_probability=0.15, + base_seed=self.cfg.seed, + log_level=logging.CRITICAL, + log_dir="/tmp/log", + return_raw_samples=False, + start_epoch=0, + sequence_length_alignment=8, + ignore_index=-1, + micro_batch_size=self.cfg.micro_batch_size, + ) + logging.info(f'Finished building LDDL Dataloaders') + def build_train_valid_test_datasets(self): logging.info('Building Bert datasets.') if self.trainer.limit_val_batches > 1.0 and isinstance(self.trainer.limit_val_batches, float): @@ -581,10 +692,14 @@ def setup(self, stage=None): else: # TODO: consider adding a ModelPT guard to check if model is being restored. 
# allowing restored models to optionally setup datasets - self.build_train_valid_test_datasets() - self.setup_training_data(self.cfg.data) - self.setup_validation_data(self.cfg.data) - self.setup_test_data(self.cfg.data) + if self.cfg.data.dataloader_type == "LDDL": + self.build_LDDL_data(self.cfg.data) + torch.distributed.barrier() + else: + self.build_train_valid_test_datasets() + self.setup_training_data(self.cfg.data) + self.setup_validation_data(self.cfg.data) + self.setup_test_data(self.cfg.data) # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: From 4bbb3c663b07045a88b9e095e890ce2e461efd6b Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Thu, 1 Jun 2023 12:07:10 -0700 Subject: [PATCH 15/35] Debug Transformer Engine FP8 support with Megatron-core infrastructure (#6740) * Construct FP8 amax reduction group Signed-off-by: Tim Moon * update core for CI Signed-off-by: Abhinav Khattar --------- Signed-off-by: Tim Moon Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar --- nemo/collections/nlp/parts/nlp_overrides.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index a43e06669..e7d74fb61 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -168,6 +168,7 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None: pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=app_state.virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, + use_fp8=app_state.use_fp8, ) # assert that fake tp and pp rank match after model parallel init @@ -405,7 +406,7 @@ class PEFTSaveRestoreConnector(NLPSaveRestoreConnector): Args: peft_model_nemo_path: Used to provide the .nemo file corresponding to a PEFT model (which will only contain a small set of params) peft_model_ckpt_path: Used to provide the path to .ckpt files of a PEFt model. This is required when no .nemo is available (yet) such as during resumed training. - If both are provided the peft_model_ckpt_path takes precedence. + If both are provided the peft_model_ckpt_path takes precedence. If neither are provided, PEFT params are initialized at random (not loaded from any external source). 
""" From e4460d1a8e728251aae87049ddeaf9af328cbc9c Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Thu, 1 Jun 2023 13:29:09 -0700 Subject: [PATCH 16/35] Tensor-parallel communication overlap with userbuffer backend (#6780) * add interfaces for tp_communication overlap [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Interface to provide custom userbuffer communicator settings by yaml file [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Construct MPI process group for userbuffers support Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Co-authored-by: Tim Moon Co-authored-by: Abhinav Khattar --- .../conf/megatron_gpt_config.yaml | 7 +++++ .../language_modeling/megatron/gpt_model.py | 2 ++ .../language_modeling/megatron_base_model.py | 9 ++++++ .../language_modeling/megatron_gpt_model.py | 28 +++++++++++++++++++ .../modules/common/megatron/language_model.py | 4 +++ .../modules/common/megatron/megatron_init.py | 2 ++ .../modules/common/megatron/transformer.py | 4 +++ nemo/collections/nlp/parts/nlp_overrides.py | 4 +++ nemo/utils/app_state.py | 17 +++++++++++ 9 files changed, 77 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index d502f255b..2135f1f0c 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -166,6 +166,13 @@ model: fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + ub_tp_comm_overlap: False + # Use userbuffer backend to overlap tensor-parallel communications with computes. + # This feature is only available with Transformer Engine and squence parallelism enabled and, currently, supports only GPT models. + ub_tp_comm_overlap_cfg: null + # A yaml file with userbuffer communicator configurations. This file should provide `method`, `dtype`, `num_sm`, `num_splits`, + # `cga_size`, `num_splits`, `set_sm_margin`, and `aggregate` for the communicators to use custom settings. + # If the configuration file is not provided a default setting is used for all communicators. data: # Path to data must be specified by the user. 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index e890e6ae4..6dc387466 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -163,6 +163,7 @@ def __init__( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + ub_tp_comm_overlap=False, ): super(GPTModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights) @@ -243,6 +244,7 @@ def __init__( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + ub_tp_comm_overlap=ub_tp_comm_overlap, ) if self.share_embeddings_and_output_weights: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 2aaedbe5a..563988323 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -123,6 +123,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): global_batch_size=cfg.get('global_batch_size'), rampup_batch_size=cfg.get('rampup_batch_size'), use_fp8=cfg.get('fp8', False), + init_mpi_proc_group=cfg.get('ub_tp_comm_overlap', False), seed=self.cfg.get('seed', 1234), apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30), ) @@ -540,6 +541,14 @@ def _validate_and_override_config(self): 'Make sure the number of model chunks is the same across all pipeline stages.' ) + if self.cfg.get('ub_tp_comm_overlap', False): + if not self.cfg.get('transformer_engine', False) or not self.cfg.get('sequence_parallel', False): + logging.info( + "Userbuffer tensor-parallel communication overlap is available with both Transformer Engine and sequence-parallelism." + ) + with open_dict(self.cfg): + self.cfg.ub_tp_comm_overlap = False + def is_data_parallel_rank_zero(self): if is_global_rank_zero(): return True diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7b67f1602..3f5dd8110 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -81,6 +81,7 @@ try: import transformer_engine + from transformer_engine.pytorch import module as te_module HAVE_TE = True @@ -179,6 +180,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self._nsys_profile_end_step *= grad_accum_steps self.get_attention_mask_from_fusion = self.cfg.get('get_attention_mask_from_fusion', True) + self.initialize_ub = self.cfg.get('ub_tp_comm_overlap', False) def get_gpt_module_list(self): if isinstance(self.model, list): @@ -254,6 +256,7 @@ def model_provider_func(self, pre_process, post_process): fp8_amax_compute_algo=self.cfg.get('fp8_amax_compute_algo', 'most_recent'), reduce_amax=self.cfg.get('reduce_amax', True), use_emha=self.cfg.get('use_emha', False), + ub_tp_comm_overlap=self.cfg.get('ub_tp_comm_overlap', False), ) return model @@ -410,6 +413,31 @@ def training_step(self, dataloader_iter, batch_idx): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ + # Initialize userbuffer communicators. Initialization is done only once at the + # beginning of the first training step. 
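+        # When ub_tp_comm_overlap_cfg is set, the file is parsed with yaml.safe_load and the
+        # resulting dict is handed to Transformer Engine's initialize_ub(); when it is not set,
+        # ub_cfgs stays None so TE falls back to its default communicator settings.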
+ if self.initialize_ub: + input_shape = [ + self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'), + self.cfg.get('hidden_size'), + ] + ub_cfg_file_name = self.cfg.get('ub_tp_comm_overlap_cfg', None) + if ub_cfg_file_name is not None: + try: + import yaml + + with open(ub_cfg_file_name, 'r') as ub_cfg_file: + ub_cfgs = yaml.safe_load(ub_cfg_file) + except (ImportError, TypeError): + print("Fail to read ub_tp_comm_overlap config file.") + else: + ub_cfgs = None + te_module.initialize_ub( + shape=input_shape, + tp_size=self.cfg.get('tensor_model_parallel_size'), + use_fp8=self.cfg.get('fp8'), + ub_cfgs=ub_cfgs, + ) + self.initialize_ub = False # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() diff --git a/nemo/collections/nlp/modules/common/megatron/language_model.py b/nemo/collections/nlp/modules/common/megatron/language_model.py index b8b12cf0c..92a1b004b 100755 --- a/nemo/collections/nlp/modules/common/megatron/language_model.py +++ b/nemo/collections/nlp/modules/common/megatron/language_model.py @@ -116,6 +116,7 @@ def get_language_model( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + ub_tp_comm_overlap=False, ): """Build language model and return along with the key to save.""" @@ -191,6 +192,7 @@ def get_language_model( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + ub_tp_comm_overlap=ub_tp_comm_overlap, ) # key used for checkpoints. language_model_key = 'language_model' @@ -497,6 +499,7 @@ def __init__( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + ub_tp_comm_overlap=False, ): super(TransformerLanguageModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights) @@ -602,6 +605,7 @@ def __init__( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + ub_tp_comm_overlap=ub_tp_comm_overlap, ) self._encoder_key = 'encoder' diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index e0551fad5..7431bffad 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -67,6 +67,7 @@ def initialize_model_parallel_for_nemo( global_batch_size=None, rampup_batch_size=None, use_fp8=False, + init_mpi_proc_group=False, seed=1234, apex_transformer_log_level=30, ): @@ -83,6 +84,7 @@ def initialize_model_parallel_for_nemo( app_state.pipeline_model_parallel_size = pipeline_model_parallel_size app_state.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size app_state.use_fp8 = use_fp8 + app_state.init_mpi_proc_group = init_mpi_proc_group ( app_state.tensor_model_parallel_rank, app_state.pipeline_model_parallel_rank, diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 9a09a9f9a..c57f3286e 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -792,6 +792,7 @@ def __init__( layer_type: str = "encoder", drop_path_rate: float = 0, use_emha: bool = False, + ub_tp_comm_overlap: bool = False, autocast_dtype: Any = 16, zero_centered_gamma: bool = False, ) -> None: @@ -824,6 +825,7 @@ def __init__( set_parallel_mode=tp_size > 1, fuse_qkv_params=True, zero_centered_gamma=zero_centered_gamma, + 
ub_tp_comm_overlap=ub_tp_comm_overlap, ) # use_emha=use_emha, @@ -919,6 +921,7 @@ def __init__( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + ub_tp_comm_overlap=False, normalize_attention_scores=True, multi_query_attention=False, num_moe_experts=1, @@ -1058,6 +1061,7 @@ def build_layer(layer_number): apply_residual_connection_post_layernorm=False, autocast_dtype=precision, use_emha=use_emha, + ub_tp_comm_overlap=ub_tp_comm_overlap, zero_centered_gamma=normalization == 'layernorm1p', ) else: diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index e7d74fb61..199a46be6 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -181,6 +181,10 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None: app_state.data_parallel_size = parallel_state.get_data_parallel_world_size() app_state.pipeline_model_parallel_group = parallel_state.get_pipeline_model_parallel_group() + # create MPI process group for UCX-based communication APIs + if app_state.init_mpi_proc_group: + torch.distributed.new_group(backend='mpi') + def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index c3ead0bff..d06e1ac32 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -55,6 +55,7 @@ def __init__(self): self._data_parallel_group = None self._megatron_checkpoint_version = None self._use_fp8 = False + self._init_mpi_proc_gruop = False self._random_seed = None @@ -363,6 +364,22 @@ def use_fp8(self, use_fp8): """ self._use_fp8 = use_fp8 + @property + def init_mpi_proc_group(self): + """ Property sets the initialization of mpi process group. + Returns: + Initialize mpi process group. + """ + return self._init_mpi_proc_group + + @init_mpi_proc_group.setter + def init_mpi_proc_group(self, init_mpi_proc_group): + """ Property sets the initialization of mpi process group. + Args: + init_mpi_proc_group: Initialize mpi process group. + """ + self._init_mpi_proc_group = init_mpi_proc_group + @property def random_seed(self): """ Property returns the random seed. From 9bd8ecd15e6b79ba85329a5f314b5de66444592e Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Date: Thu, 1 Jun 2023 20:55:06 -0700 Subject: [PATCH 17/35] Fix adapter tutorial r1.19.0 (#6776) * Fix TTS adapter tutorial Signed-off-by: hsiehjackson * Fix version Signed-off-by: hsiehjackson --------- Signed-off-by: hsiehjackson --- nemo/collections/tts/modules/submodules.py | 16 +- .../tts/FastPitch_Adapter_Finetuning.ipynb | 178 +++++++----------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 8 +- 3 files changed, 78 insertions(+), 124 deletions(-) diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index 6efccf18e..408ab02de 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -758,15 +758,11 @@ def forward(self, batch_size=None, speaker=None, reference_spec=None, reference_ embs = self.lookup_module(speaker) # Get GST based speaker embedding - if self.gst_module is not None: - if reference_spec is None or reference_spec_lens is None: - raise ValueError( - "You should add `reference_audio` in sup_data_types or remove `speaker_encoder`in config." 
- ) - out = self.gst_module(reference_spec, reference_spec_lens) - embs = out if embs is None else embs + out - - elif self.gst_module is None and reference_spec is not None and reference_spec_lens is not None: - logging.warning("You may add `gst_module` in speaker_encoder to use reference_audio.") + if reference_spec is not None and reference_spec_lens is not None: + if self.gst_module is not None: + out = self.gst_module(reference_spec, reference_spec_lens) + embs = out if embs is None else embs + out + else: + logging.warning("You may add `gst_module` in speaker_encoder to use reference_audio.") return embs diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 95bc38050..67e274ff3 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ea49c0e5", + "id": "ed07e3c2", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -16,14 +16,14 @@ "2. **Fine-tune HiFiGAN on adaptation data**: fine-tune a vocoder for the fine-tuned multi-speaker FastPitch\n", "* Dataset Preparation: extract mel-spectrograms from fine-tuned FastPitch.\n", "* Training: fine-tune HiFiGAN with fine-tuned adaptation data.\n", - "3. **Inference**: generate speech from adpated FastPitch\n", + "3. **Inference**: generate speech from adapted FastPitch\n", "* Load Model: load pre-trained multi-speaker FastPitch with **fine-tuned adapters**.\n", "* Output Audio: generate audio files." ] }, { "cell_type": "markdown", - "id": "37259555", + "id": "772e7404", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d61cbea5", + "id": "8f799aa0", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fef9aba9", + "id": "0a4d3371", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49bc38ab", + "id": "25d94e3a", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9459f9dc", + "id": "79cb9932", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb26f54d", + "id": "ec7fed4e", "metadata": {}, "outputs": [], "source": [ @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "12b28329", + "id": "f815deff", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "30996769", + "id": "539e8f0d", "metadata": {}, "source": [ "# 1. Fine-tune FastPitch on adaptation data" @@ -157,17 +157,17 @@ }, { "cell_type": "markdown", - "id": "2f5f5945", + "id": "270ed53f", "metadata": {}, "source": [ "## a. Data Preparation\n", - "For our tutorial, we use small part of VCTK dataset with a new target speaker (p267). Usually, the audios should have total duration more than 15 mintues." + "For our tutorial, we use small part of VCTK dataset with a new target speaker (p267). Usually, the audios should have total duration more than 15 minutes." 
] }, { "cell_type": "code", "execution_count": null, - "id": "8047f988", + "id": "21ce4a34", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b8242769", + "id": "2d5edbe5", "metadata": {}, "outputs": [], "source": [ @@ -188,7 +188,7 @@ { "cell_type": "code", "execution_count": null, - "id": "79cf8539", + "id": "c1de2249", "metadata": {}, "outputs": [], "source": [ @@ -198,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "35c3b97b", + "id": "e657c830", "metadata": {}, "source": [ "## b. Preprocessing" @@ -206,17 +206,17 @@ }, { "cell_type": "markdown", - "id": "ba3a7c3a", + "id": "4d0076d4", "metadata": {}, "source": [ "### Add absolute file path in manifest\n", - "We use absoluate path for audio_filepath to get the audio during training." + "We use absolute path for audio_filepath to get the audio during training." ] }, { "cell_type": "code", "execution_count": null, - "id": "8bc485b5", + "id": "7ccb5fb6", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9cb8ef5", + "id": "23dc1ba6", "metadata": {}, "outputs": [], "source": [ @@ -241,7 +241,7 @@ }, { "cell_type": "markdown", - "id": "f92054d5", + "id": "b852072b", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -252,7 +252,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0adc618b", + "id": "f6bdd226", "metadata": {}, "outputs": [], "source": [ @@ -267,7 +267,7 @@ }, { "cell_type": "markdown", - "id": "96dd5fe1", + "id": "fdae4e4e", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -280,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "23703c76", + "id": "ac8fae15", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ }, { "cell_type": "markdown", - "id": "7c70e5db", + "id": "c9f98c86", "metadata": {}, "source": [ "## c. Model Setting\n", @@ -305,7 +305,7 @@ { "cell_type": "code", "execution_count": null, - "id": "439f2f82", + "id": "fd8c66fb", "metadata": {}, "outputs": [], "source": [ @@ -318,7 +318,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30f865cb", + "id": "ff535c8f", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +350,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e92910b5", + "id": "4f457111", "metadata": {}, "outputs": [], "source": [ @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "7f03219f", + "id": "ef40def3", "metadata": {}, "source": [ "### Precompute Speaker Embedding\n", @@ -370,7 +370,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c2a35241", + "id": "30664bcb", "metadata": {}, "outputs": [], "source": [ @@ -405,7 +405,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5fa1b309", + "id": "43001c75", "metadata": {}, "outputs": [], "source": [ @@ -417,7 +417,7 @@ }, { "cell_type": "markdown", - "id": "3b77e95f", + "id": "42915e02", "metadata": {}, "source": [ "## d. 
Training" @@ -426,21 +426,21 @@ { "cell_type": "code", "execution_count": null, - "id": "9e8c3740", + "id": "884bc2d0", "metadata": {}, "outputs": [], "source": [ "phoneme_dict_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"cmudict-0.7b_nv22.10\"))\n", "heteronyms_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n", "\n", - "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to overide pitch_mean and pitch_std configs below.\n", + "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to override pitch_mean and pitch_std configs below.\n", "PITCH_MEAN=175.48513793945312\n", "PITCH_STD=42.3786735534668" ] }, { "cell_type": "markdown", - "id": "19bb6d8b", + "id": "6f04fc86", "metadata": {}, "source": [ "### Important notes\n", @@ -451,13 +451,16 @@ "* Other optional arguments based on your preference:\n", " * batch_size\n", " * exp_manager\n", - " * trainer" + " * trainer\n", + " * model.unfreeze_aligner=true\n", + " * model.unfreeze_duration_predictor=true\n", + " * model.unfreeze_pitch_predictor=true" ] }, { "cell_type": "code", "execution_count": null, - "id": "8c8cbea2", + "id": "7ae8383a", "metadata": {}, "outputs": [], "source": [ @@ -476,9 +479,11 @@ "~model.speaker_encoder.gst_module \\\n", "model.train_ds.dataloader_params.batch_size=8 \\\n", "model.validation_ds.dataloader_params.batch_size=8 \\\n", + "+model.text_tokenizer.add_blank_at=True \\\n", "model.optim.name=adam \\\n", - "model.optim.lr=2e-4 \\\n", - "~model.optim.sched \\\n", + "model.optim.lr=1e-3 \\\n", + "model.optim.sched.warmup_steps=0 \\\n", + "+model.optim.sched.min_lr=1e-4 \\\n", "exp_manager.exp_dir={logs_dir} \\\n", "+exp_manager.create_wandb_logger=True \\\n", "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-finetune-adaptation\" \\\n", @@ -495,7 +500,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe5c7b2f", + "id": "39d3074c", "metadata": {}, "outputs": [], "source": [ @@ -510,7 +515,7 @@ }, { "cell_type": "markdown", - "id": "75856d0e", + "id": "9e9a1f45", "metadata": {}, "source": [ "# 3. Fine-tune HiFiGAN on adaptation data" @@ -518,7 +523,7 @@ }, { "cell_type": "markdown", - "id": "3444698f", + "id": "deec135f", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -528,7 +533,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bb2fd64d", + "id": "1aecaa68", "metadata": {}, "outputs": [], "source": [ @@ -554,7 +559,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da69cb66", + "id": "6a153ea0", "metadata": {}, "outputs": [], "source": [ @@ -564,7 +569,7 @@ }, { "cell_type": "markdown", - "id": "fa2cbb02", + "id": "b05cd550", "metadata": {}, "source": [ "## b. Training" @@ -573,7 +578,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ffdce5d5", + "id": "e5d5f281", "metadata": {}, "outputs": [], "source": [ @@ -601,7 +606,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e6376cf", + "id": "9c1c42f3", "metadata": {}, "outputs": [], "source": [ @@ -613,7 +618,7 @@ }, { "cell_type": "markdown", - "id": "e5076e51", + "id": "0665ac78", "metadata": {}, "source": [ "# 4. Inference" @@ -622,7 +627,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52358549", + "id": "5f4afb24", "metadata": {}, "outputs": [], "source": [ @@ -633,7 +638,7 @@ }, { "cell_type": "markdown", - "id": "9e96ee13", + "id": "0d9ff309", "metadata": {}, "source": [ "## a. 
Load Model" @@ -642,17 +647,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2cb5d524", - "metadata": {}, - "outputs": [], - "source": [ - "wave_model = WaveformFeaturizer(sample_rate=sample_rate)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32dbd30c", + "id": "81e4dee0", "metadata": {}, "outputs": [], "source": [ @@ -668,7 +663,7 @@ { "cell_type": "code", "execution_count": null, - "id": "74a7ad03", + "id": "1eaef8be", "metadata": {}, "outputs": [], "source": [ @@ -678,7 +673,7 @@ }, { "cell_type": "markdown", - "id": "4f882975", + "id": "837bdbab", "metadata": {}, "source": [ "## b. Output Audio" @@ -687,26 +682,14 @@ { "cell_type": "code", "execution_count": null, - "id": "2178a8ef", + "id": "fef139cb", "metadata": {}, "outputs": [], "source": [ - "def gt_spectrogram(audio_path, wave_model, spec_gen_model):\n", - " features = wave_model.process(audio_path, trim=False)\n", - " audio, audio_length = features, torch.tensor(features.shape[0]).long()\n", - " audio = audio.unsqueeze(0).to(device=spec_gen_model.device)\n", - " audio_length = audio_length.unsqueeze(0).to(device=spec_gen_model.device)\n", - " with torch.no_grad():\n", - " spectrogram, spec_len = spec_gen_model.preprocessor(input_signal=audio, length=audio_length)\n", - " return spectrogram, spec_len\n", - "\n", - "def gen_spectrogram(text, spec_gen_model, reference_spec, reference_spec_lens):\n", + "def gen_spectrogram(text, spec_gen_model):\n", " parsed = spec_gen_model.parse(text)\n", " with torch.no_grad(): \n", - " spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, \n", - " reference_spec=reference_spec, \n", - " reference_spec_lens=reference_spec_lens)\n", - "\n", + " spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed)\n", " return spectrogram\n", " \n", "def synth_audio(vocoder_model, spectrogram): \n", @@ -720,16 +703,10 @@ { "cell_type": "code", "execution_count": null, - "id": "766154e3", + "id": "b98ac280", "metadata": {}, "outputs": [], "source": [ - "# Reference Audio\n", - "with open(train_manifest, \"r\") as f:\n", - " for i, line in enumerate(f):\n", - " reference_record = json.loads(line)\n", - " break\n", - " \n", "# Validatation Audio\n", "num_val = 3\n", "val_records = []\n", @@ -743,27 +720,19 @@ { "cell_type": "code", "execution_count": null, - "id": "dfa71ca6", + "id": "b17446f9", "metadata": {}, "outputs": [], "source": [ "for i, val_record in enumerate(val_records):\n", - " reference_spec, reference_spec_lens = gt_spectrogram(reference_record['audio_filepath'], wave_model, spec_model)\n", - " reference_spec = reference_spec.to(spec_model.device)\n", - " spec_pred = gen_spectrogram(val_record['text'], spec_model,\n", - " reference_spec=reference_spec, \n", - " reference_spec_lens=reference_spec_lens)\n", - "\n", + " spec_pred = gen_spectrogram(val_record['text'], spec_model)\n", " audio_gen = synth_audio(vocoder_model, spec_pred)\n", - " \n", - " audio_ref = ipd.Audio(reference_record['audio_filepath'], rate=sample_rate)\n", + "\n", " audio_gt = ipd.Audio(val_record['audio_filepath'], rate=sample_rate)\n", " audio_gen = ipd.Audio(audio_gen, rate=sample_rate)\n", " \n", " print(\"------\")\n", " print(f\"Text: {val_record['text']}\")\n", - " print('Reference Audio')\n", - " ipd.display(audio_ref)\n", " print('Ground Truth Audio')\n", " ipd.display(audio_gt)\n", " print('Synthesized Audio')\n", @@ -775,18 +744,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51d9d176", - "metadata": {}, - "outputs": [], - "source": [ - 
"print(f\"Pretraind FastPitch: {pretrained_fastpitch_checkpoint}\")\n", - "print(f\"Finetuned Adapter: {finetuned_adapter_checkpoint}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6180a7d2", + "id": "f8f525d1", "metadata": {}, "outputs": [], "source": [ @@ -797,7 +755,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5b33263b", + "id": "66e8ab7d", "metadata": {}, "outputs": [], "source": [] @@ -819,7 +777,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index a67744ef0..1292cfcab 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -195,8 +195,8 @@ "id": "cae8567d", "metadata": {}, "source": [ - "### Add absoluate audio path in manifest\n", - "We use absoluate path for `audio_filepath` to get the audio during training." + "### Add absolute audio path in manifest\n", + "We use absolute path for `audio_filepath` to get the audio during training." ] }, { @@ -337,7 +337,7 @@ "phoneme_dict_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"cmudict-0.7b_nv22.10\"))\n", "heteronyms_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n", "\n", - "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to overide pitch_mean and pitch_std configs below.\n", + "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to override pitch_mean and pitch_std configs below.\n", "PITCH_MEAN=140.84278869628906\n", "PITCH_STD=65.4063949584961" ] @@ -727,7 +727,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.13" } }, "nbformat": 4, From 913e5e5fabd250d2442a0b9615fb84dbe0fb5598 Mon Sep 17 00:00:00 2001 From: Sandeep Subramanian Date: Fri, 2 Jun 2023 10:20:00 -0700 Subject: [PATCH 18/35] Fix check (#6798) Signed-off-by: MaximumEntropy --- .../nlp/data/language_modeling/megatron/gpt_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py index cf1de245d..d7113e7cd 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py @@ -601,7 +601,7 @@ def _build_index_mappings( last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one assert last_epoch_num_samples >= 0, 'last epoch number of samples should be non-negative.' num_samples_per_epoch = (tokens_per_epoch - add_extra_token) // seq_length - assert last_epoch_num_samples < ( + assert last_epoch_num_samples <= ( num_samples_per_epoch + 1 ), 'last epoch number of samples exceeded max value.' 
# If we have less than 80% of the samples for the last epoch, From a8aa8f126b651c5a6091561be46b94f80d08cb8b Mon Sep 17 00:00:00 2001 From: Markel Sanz Ausin Date: Fri, 2 Jun 2023 16:01:43 -0700 Subject: [PATCH 19/35] Bug fix for reset_sequence_parallel_args (#6802) Signed-off-by: Markel Sanz Ausin --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3f5dd8110..7033d57a0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1206,7 +1206,7 @@ def _reset_sequence_parallelism_args(self): for module in self.get_gpt_module_list(): for mod in module.modules(): if hasattr(mod, "sequence_parallel"): - mod.sequence_parallel = self.last_sequence_parallel + mod.sequence_parallel = False def _restore_sequence_parallelism_args(self): """ Restores the sequence parallelism parameters using the values saved by From 0e0253ea7b73bd09b6e14bca5b933bfe576a86ad Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Mon, 5 Jun 2023 08:36:29 -0700 Subject: [PATCH 20/35] Add ub communicator initialization to validation step (#6807) --- .../language_modeling/megatron_gpt_model.py | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7033d57a0..6518fb796 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -407,37 +407,39 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean + def initialize_ub_func(self): + input_shape = [ + self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'), + self.cfg.get('hidden_size'), + ] + ub_cfg_file_name = self.cfg.get('ub_tp_comm_overlap_cfg', None) + if ub_cfg_file_name is not None: + try: + import yaml + + with open(ub_cfg_file_name, 'r') as ub_cfg_file: + ub_cfgs = yaml.safe_load(ub_cfg_file) + except (ImportError, TypeError): + print("Fail to read ub_tp_comm_overlap config file.") + else: + ub_cfgs = None + te_module.initialize_ub( + shape=input_shape, + tp_size=self.cfg.get('tensor_model_parallel_size'), + use_fp8=self.cfg.get('fp8'), + ub_cfgs=ub_cfgs, + ) + self.initialize_ub = False + def training_step(self, dataloader_iter, batch_idx): """ We pass the dataloader iterator function to the micro-batch scheduler. The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ - # Initialize userbuffer communicators. Initialization is done only once at the - # beginning of the first training step. + # Initialize userbuffer communicators. 
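+        # (validation_step below performs the same lazy initialization)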
if self.initialize_ub: - input_shape = [ - self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'), - self.cfg.get('hidden_size'), - ] - ub_cfg_file_name = self.cfg.get('ub_tp_comm_overlap_cfg', None) - if ub_cfg_file_name is not None: - try: - import yaml - - with open(ub_cfg_file_name, 'r') as ub_cfg_file: - ub_cfgs = yaml.safe_load(ub_cfg_file) - except (ImportError, TypeError): - print("Fail to read ub_tp_comm_overlap config file.") - else: - ub_cfgs = None - te_module.initialize_ub( - shape=input_shape, - tp_size=self.cfg.get('tensor_model_parallel_size'), - use_fp8=self.cfg.get('fp8'), - ub_cfgs=ub_cfgs, - ) - self.initialize_ub = False + self.initialize_ub_func() # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() @@ -762,6 +764,10 @@ def validation_step(self, dataloader_iter, batch_idx): from the dataloader to produce a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ + # Initialize userbuffer communicators. + if self.initialize_ub: + self.initialize_ub_func() + if isinstance(self.model, list): for model_module in self.model: model_module.eval() From 41bb941ecbc235c321642456d61d0d7c011ef5d4 Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Mon, 5 Jun 2023 22:54:02 -0700 Subject: [PATCH 21/35] update core version (#6817) Signed-off-by: Abhinav Khattar --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7d0b8ee28..e16bb5d66 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -61,7 +61,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout e6d7e09845590d0a36bc7f29eb28db974fb8da4e && \ + git checkout d2891b4ad3a00e3c4223f89491afd9e1b812f9b5 && \ pip install -e .' } } From 45144f53b2be2e9a31e8aa8647ae40426b91d5c3 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Tue, 6 Jun 2023 12:00:05 -0600 Subject: [PATCH 22/35] Add trainer.validate example for GPT (#6794) * add trainer.validate example Signed-off-by: ericharper * clean up white space Signed-off-by: ericharper * add mbs and gbs to the config Signed-off-by: ericharper --------- Signed-off-by: ericharper --- .../conf/megatron_gpt_validate_config.yaml | 22 +++ .../megatron_gpt_validate.py | 155 ++++++++++++++++++ .../language_modeling/megatron_gpt_model.py | 23 +-- 3 files changed, 189 insertions(+), 11 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml create mode 100644 examples/nlp/language_modeling/megatron_gpt_validate.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml new file mode 100644 index 000000000..39b0c7ed2 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml @@ -0,0 +1,22 @@ +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 16 # 16, 32, or bf16 + log_every_n_steps: 1 + limit_val_batches: 10 + limit_test_batches: 50 + max_steps: 100 # needed to setup dataloaders + max_epochs: null + replace_sampler_ddp: False + +tensor_model_parallel_size: ??? # should be set the same as the pretrained model that is being restored from +pipeline_model_parallel_size: ??? 
# should be set the same as the pretrained model that is being restored from +micro_batch_size: null # limited by GPU memory, defaults to pretrained model config +global_batch_size: null # will use more micro batches to reach global batch size, defaults to pretrained model config +virtual_pipeline_model_parallel_size: null +gpt_model_file: null # GPT nemo file path +checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading diff --git a/examples/nlp/language_modeling/megatron_gpt_validate.py b/examples/nlp/language_modeling/megatron_gpt_validate.py new file mode 100644 index 000000000..b5a61e627 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_gpt_validate.py @@ -0,0 +1,155 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +from omegaconf import OmegaConf, open_dict +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.parts.nlp_overrides import ( + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.model_utils import inject_model_parallel_rank + +""" Example script showing how to run validation on a MegatronGPT model. 
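+
+    The pretrained model's configuration is reused for validation; modify_pretrained_cfg()
+    below disables sequence parallelism and activation checkpointing and, if given, applies
+    the micro/global batch size overrides before trainer.validate() is called.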
+ + Sample usage: + + From nemo model: + + python megatron_gpt_validate.py \ + trainer.devices=4 \ + trainer.num_nodes=1 \ + trainer.limit_val_batches=10 \ + trainer.max_steps=100 \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=4 \ + trainer.precision=bf16 \ + gpt_model_file=/path/to/megatron_gpt_tp_1_pp4.nemo + + from PTL checkpoint: + python megatron_gpt_validate.py \ + trainer.devices=4 \ + trainer.num_nodes=1 \ + trainer.limit_val_batches=10 \ + trainer.max_steps=100 \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=4 \ + virtual_pipeline_model_parallel_size=4 \ + trainer.precision=bf16 \ + checkpoint_dir='/path/to/experiment/checkpoints' \ + checkpoint_name='megatron_gpt--val_loss=7.78-step=100-consumed_samples=6336.0-last.ckpt' \ + hparams_file='/path/to/experiment/hparams.yaml + +""" + + +def modify_pretrained_cfg(pretrained_cfg, trainer, cfg): + with open_dict(pretrained_cfg): + OmegaConf.set_struct(pretrained_cfg, True) + pretrained_cfg.sequence_parallel = False + pretrained_cfg.activations_checkpoint_granularity = None + pretrained_cfg.activations_checkpoint_method = None + pretrained_cfg.precision = trainer.precision + if cfg.micro_batch_size is not None: + pretrained_cfg.micro_batch_size = cfg.micro_batch_size + if cfg.global_batch_size is not None: + pretrained_cfg.global_batch_size = cfg.global_batch_size + if trainer.precision == "16": + pretrained_cfg.megatron_amp_O2 = False + return pretrained_cfg + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_validate_config") +def main(cfg) -> None: + + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + + assert ( + cfg.trainer.devices * cfg.trainer.num_nodes + == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + if cfg.gpt_model_file: + logging.info(f"Restoring model from {cfg.gpt_model_file}") + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + + pretrained_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + pretrained_cfg = modify_pretrained_cfg(pretrained_cfg, trainer, cfg) + model = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + override_config_path=pretrained_cfg, + save_restore_connector=save_restore_connector, + map_location=f'cuda:{trainer.local_rank}', # map_location is needed for converted models + ) + elif cfg.checkpoint_dir: + logging.info( + f"Restoring model from checkpoint_dir: {cfg.checkpoint_dir} with checkpoint name: {cfg.checkpoint_name}" + ) + app_state = AppState() + if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + app_state.virtual_pipeline_model_parallel_size = cfg.virtual_pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + 
world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size_=cfg.virtual_pipeline_model_parallel_size, + ) + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + pretrained_cfg = OmegaConf.load(cfg.hparams_file) + pretrained_cfg = modify_pretrained_cfg(pretrained_cfg.cfg, trainer, cfg) + with tempfile.NamedTemporaryFile(suffix='.yaml') as f: + OmegaConf.save(config=pretrained_cfg, f=f.name) + model = MegatronGPTModel.load_from_checkpoint( + checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name, + ) + else: + raise ValueError("need at least a nemo file or checkpoint dir") + + logging.info("\n\n************** Model configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(model.cfg)}') + + trainer.validate(model=model) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 6518fb796..a0b9d215f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -361,7 +361,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): # TODO @akhattar: add num_micro_batches_with_partial_activation_checkpoints when ready losses_reduced_per_micro_batch = fwd_bwd_function( - forward_step_func=self.get_forward_output_and_loss_func(), + forward_step_func=self.get_forward_output_and_loss_func(forward_only), data_iterator=self._make_data_iterator_list(dataloader_iter), model=self.model, num_microbatches=get_num_microbatches(), @@ -956,17 +956,18 @@ def setup(self, stage=None): self.setup_validation_data(self.cfg.data) self.setup_test_data(self.cfg.data) - # when using pipeline model parallel the final stage need to initialize word embeddings - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - if isinstance(self.model, list): - for i, module in enumerate(self.model): - parallel_state.set_virtual_pipeline_model_parallel_rank(i) + if stage == 'fit': + # when using pipeline model parallel the final stage need to initialize word embeddings + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + if isinstance(self.model, list): + for i, module in enumerate(self.model): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + if self.cfg.get('share_embeddings_and_output_weights', True): + module.sync_initial_word_embeddings() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + else: if self.cfg.get('share_embeddings_and_output_weights', True): - module.sync_initial_word_embeddings() - parallel_state.set_virtual_pipeline_model_parallel_rank(0) - else: - if self.cfg.get('share_embeddings_and_output_weights', True): - self.model.sync_initial_word_embeddings() + self.model.sync_initial_word_embeddings() if self.cfg.get('transformer_engine', False): self.setup_transformer_engine_tp_groups() From dc52b949617772d23366ae4c485ee0a43c3d5f99 Mon Sep 17 00:00:00 2001 From: Yi Dong <43824965+yidong72@users.noreply.github.com> Date: Thu, 8 Jun 2023 15:48:32 -0400 Subject: [PATCH 23/35] fix notebook error (#6840) Signed-off-by: Yi Dong --- ...on_Synthetic_Tabular_Data_Generation.ipynb | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 
deletion(-) diff --git a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb index a92317b17..1d9849467 100644 --- a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb +++ b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "c3217a15", "metadata": {}, @@ -15,6 +16,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8c72dc42", "metadata": {}, @@ -25,6 +27,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "79154a9e", "metadata": {}, @@ -73,6 +76,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7e0bbc89", "metadata": {}, @@ -92,6 +96,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1ff1d46f", "metadata": {}, @@ -141,6 +146,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "aa356012", "metadata": {}, @@ -239,6 +245,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "02bff63f", "metadata": {}, @@ -267,6 +274,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "89e1e5b3", "metadata": {}, @@ -339,6 +347,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "05ebadc3", "metadata": {}, @@ -347,6 +356,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2fe38a29", "metadata": {}, @@ -381,6 +391,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "678f65ef", "metadata": {}, @@ -411,6 +422,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8af66b4a", "metadata": {}, @@ -464,6 +476,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6ecec681", "metadata": {}, @@ -472,6 +485,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "58a3d4fa", "metadata": {}, @@ -543,6 +557,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "45ac928f", "metadata": {}, @@ -557,6 +572,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "158a4bbe", "metadata": {}, @@ -586,7 +602,7 @@ "outputs": [], "source": [ "CHECKPONT_FILE_NAME = megatron_gpt--val_loss=1.17-step=10047-consumed_samples=80376.0-last.ckpt # change it to your checkpoint file name\n", - "!python -m torch.distributed.launch --nproc_per_node=1 megatron_ckpt_to_nemo.py \\\n", + "!python -m torch.distributed.launch --nproc_per_node=1 --use-env=True megatron_ckpt_to_nemo.py \\\n", " --checkpoint_folder=gpt_creditcard_results/megatron_gpt/checkpoints/ \\\n", " --checkpoint_name={CHECKPONT_FILE_NAME} \\\n", " --nemo_file_path=tabular.nemo \\\n", @@ -597,6 +613,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fa16378e", "metadata": {}, @@ -605,6 +622,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ed056ec6", "metadata": {}, @@ -630,6 +648,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a62b48dc", "metadata": {}, @@ -685,6 +704,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cccd54d9", "metadata": {}, @@ -790,6 +810,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0f2f6e3a", "metadata": {}, From 4239b8003c18a4de99272b826f4683590b57e4a5 Mon Sep 17 00:00:00 2001 From: Yi Dong <43824965+yidong72@users.noreply.github.com> Date: Thu, 8 Jun 2023 19:05:23 -0400 Subject: [PATCH 24/35] fix (#6842) Signed-off-by: Yi Dong --- tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb 
b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb index 1d9849467..84ecdec36 100644 --- a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb +++ b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb @@ -602,7 +602,7 @@ "outputs": [], "source": [ "CHECKPONT_FILE_NAME = megatron_gpt--val_loss=1.17-step=10047-consumed_samples=80376.0-last.ckpt # change it to your checkpoint file name\n", - "!python -m torch.distributed.launch --nproc_per_node=1 --use-env=True megatron_ckpt_to_nemo.py \\\n", + "!torchrun --nproc_per_node=1 megatron_ckpt_to_nemo.py \\\n", " --checkpoint_folder=gpt_creditcard_results/megatron_gpt/checkpoints/ \\\n", " --checkpoint_name={CHECKPONT_FILE_NAME} \\\n", " --nemo_file_path=tabular.nemo \\\n", From 87e1b8180a2e4e31dcee4deb43bda56a75d2a53c Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Tue, 13 Jun 2023 11:25:07 -0600 Subject: [PATCH 25/35] Add API docs for NeMo Megatron (#6850) * add model pretraining and customization classes Signed-off-by: ericharper * fix Signed-off-by: ericharper * test width Signed-off-by: ericharper * increase middle pane width Signed-off-by: ericharper * add modules and datasets Signed-off-by: ericharper * remove global in t5 dataset s and fix formatting in megatron base model Signed-off-by: ericharper --------- Signed-off-by: ericharper --- docs/source/_static/css/custom.css | 2 +- docs/source/conf.py | 5 +- docs/source/nlp/api.rst | 193 +++++++++++------- .../language_modeling/megatron_base_model.py | 25 +-- 4 files changed, 135 insertions(+), 90 deletions(-) diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css index da134a02d..cf0ad0ff2 100644 --- a/docs/source/_static/css/custom.css +++ b/docs/source/_static/css/custom.css @@ -255,7 +255,7 @@ article ul { } } -@media (min-width: 1400px) { +@media (min-width: none) { body { font-size: 18px; } diff --git a/docs/source/conf.py b/docs/source/conf.py index a78ba3528..0765f8940 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -28,7 +28,6 @@ sys.path.insert(0, os.path.abspath("../..")) sys.path.insert(0, os.path.abspath("../../nemo")) -sys.path.insert(0, os.path.abspath("../../nemo_text_processing")) from package_info import __version__ @@ -47,7 +46,6 @@ 'hydra', # hydra-core in requirements, hydra during import 'dateutil', # part of core python 'transformers.tokenization_bert', # has ., troublesome for this regex - 'megatron', # megatron-lm in requirements, megatron in import 'sklearn', # scikit_learn in requirements, sklearn in import 'nemo_text_processing.inverse_text_normalization', # Not installed automatically 'nemo_text_processing.text_normalization', # Not installed automatically @@ -55,10 +53,13 @@ 'torchmetrics', # inherited from PTL 'lightning_utilities', # inherited from PTL 'apex', + 'megatron.core', + 'transformer_engine', 'joblib', # inherited from optional code 'IPython', 'ipadic', 'psutil', + 'regex', ] _skipped_autodoc_mock_imports = ['wrapt', 'numpy'] diff --git a/docs/source/nlp/api.rst b/docs/source/nlp/api.rst index 46efb0851..7c6971a68 100755 --- a/docs/source/nlp/api.rst +++ b/docs/source/nlp/api.rst @@ -1,99 +1,142 @@ -NeMo NLP collection API +NeMo Megatron API ======================= -Model Classes -------------- +Pretraining Model Classes +------------------------- + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_base_model.MegatronBaseModel + :show-inheritance: + :no-members: + :members: __init__, configure_optimizers + +.. 
autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel + :show-inheritance: + :no-members: + :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup, on_save_checkpoint, on_load_checkpoint + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_bert_model.MegatronBertModel + :show-inheritance: + :no-members: + :members: training_step, validation_step, build_train_valid_test_datasets, build_LDDL_data, setup, on_save_checkpoint, on_load_checkpoint + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_bart_model.MegatronBARTModel + :show-inheritance: + :no-members: + :members: training_step, validation_step, build_train_valid_test_datasets, setup, on_save_checkpoint, on_load_checkpoint + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_retrieval_model.MegatronRetrievalModel + :show-inheritance: + :no-members: + :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model + :show-inheritance: + :no-members: + :members: complete, encode, decode, add_special_tokens_to_tokenizer, training_step, validation_step, build_train_valid_test_datasets, setup + +Customization Model Classes +--------------------------- + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model.MegatronGPTSFTModel + :show-inheritance: + :no-members: + :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTAdapterLearningModel + :show-inheritance: + :no-members: + :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTInfusedAdapterModel + :show-inheritance: + :no-members: + :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model.MegatronGPTPromptLearningModel + :show-inheritance: + :no-members: + :members: built_virtual_prompt_dataset, generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel + :show-inheritance: + :no-members: + :members: __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel + :show-inheritance: + :no-members: + :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5InfusedAdapterModel + :show-inheritance: + :no-members: + :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup -.. autoclass:: nemo.collections.nlp.models.TextClassificationModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact, classifytext +Modules +------- -.. 
autoclass:: nemo.collections.nlp.models.GLUEModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact +.. autoclass:: nemo.collections.nlp.modules.common.megatron.module.MegatronModule + :show-inheritance: -.. autoclass:: nemo.collections.nlp.models.PunctuationCapitalizationModel - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.modules.common.megatron.module.Float16Module + :show-inheritance: -.. autoclass:: nemo.collections.nlp.models.TokenClassificationModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact - -.. autoclass:: nemo.collections.nlp.models.QAModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, inference, validation_epoch_end, test_epoch_end -.. autoclass:: nemo.collections.nlp.models.DuplexTaggerModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, inference, validation_epoch_end, test_epoch_end +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.gpt_model.GPTModel + :show-inheritance: + :no-members: + :members: forward -.. autoclass:: nemo.collections.nlp.models.DuplexDecoderModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, inference, validation_epoch_end, test_epoch_end +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.bert_model.BertModel + :show-inheritance: + :no-members: + :members: forward -.. autoclass:: nemo.collections.nlp.models.BERTLMModel - :show-inheritance: - :members: setup_training_data, setup_optimization +.. autoclass:: nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder.MegatronTokenLevelEncoderDecoderModule + :show-inheritance: + :no-members: + :members: forward -Modules -------- +.. autoclass:: nemo.collections.nlp.modules.common.megatron.retrieval_token_level_encoder_decoder.MegatronRetrievalTokenLevelEncoderDecoderModule + :show-inheritance: + :no-members: + :members: forward -.. autoclass:: nemo.collections.nlp.modules.BertModule - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.nlp.modules.AlbertEncoder - :show-inheritance: - :members: -.. autoclass:: nemo.collections.nlp.modules.BertEncoder - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.nlp.modules.DistilBertEncoder - :show-inheritance: - :members: +Datasets +-------- -.. autoclass:: nemo.collections.nlp.modules.RobertaEncoder - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset.BlendableDataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.modules.SequenceClassifier - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset.GPTDataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.modules.SequenceRegression - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset.MockGPTDataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.modules.SequenceTokenClassifier - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.bert_dataset.BertDataset + :show-inheritance: -.. autofunction:: nemo.collections.nlp.modules.get_lm_model +.. 
autoclass:: nemo.collections.nlp.data.language_modeling.megatron.base_prompt_learning_dataset.BasePromptLearningDataset + :show-inheritance: -.. autofunction:: nemo.collections.nlp.modules.get_pretrained_lm_models_list +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset + :show-inheritance: -.. autofunction:: nemo.collections.nlp.modules.common.megatron.get_megatron_lm_models_list +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTChatDataset + :show-inheritance: -Datasets --------- +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.retro_dataset.RETRODataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset.BertPunctuationCapitalizationDataset - :show-inheritance: - :members: - :special-members: __getitem__ +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.t5_dataset.T5Dataset + :show-inheritance: + :exclude-members: MAX_SEQ_LENGTH_DELTA -.. autofunction:: nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.create_tarred_dataset +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.t5_prompt_learning_dataset.T5PromptLearningDataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.BertPunctuationCapitalizationTarredDataset - :show-inheritance: - :members: - :special-members: __iter__ - :exclude-members: reinforce_type +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.ul2_dataset.UL2Dataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.data.token_classification.punctuation_capitalization_infer_dataset.BertPunctuationCapitalizationInferDataset - :show-inheritance: - :members: - :special-members: __getitem__ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 563988323..b96a5adb0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -60,18 +60,19 @@ class MegatronBaseModel(NLPModel): """ - Megatron base class - It does the following things: - 1. Initialize the model parallel for nemo given the model parallel parameters. - 2. Turn on all the nvidia optimizations. - 3. If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the correct size for tensor model parallelism. - 4. If using distributed optimizer, configure to be compatible with - O2-level optimizations and/or model parallelism. - 5. Perform gradient clipping: `grad_clip_pl_default` triggers the - PyTorch Lightning default implementation, `with_distributed_adam` - triggers the distributed optimizer's implementation, - `megatron_amp_o2` triggers gradient clipping on the main grads, - and otherwise gradient clipping is performed on the model grads. + Megatron base class. All NeMo Megatron models inherit from this class. + + - Initialize the model parallel world for nemo. + - Turn on all of the nvidia optimizations. + - If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the + correct size for tensor model parallelism. + - If using distributed optimizer, configure to be compatible + with O2 level optimizations and/or model parallelism. 
+ - Perform gradient clipping: `grad_clip_pl_default` triggers + the PyTorch Lightning default implementation, `with_distributed_adam` triggers + the distributed optimizer's implementation, `megatron_amp_o2` triggers gradient clipping on the main grads, + and otherwise gradient clipping is performed on the model grads. + """ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): From f8757027193aea5f7165d8f21c81baee9e5643fc Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Wed, 14 Jun 2023 15:50:36 -0700 Subject: [PATCH 26/35] Apply garbage collection interval to validation steps (#6870) * Apply garbage collection inverval to validation steps Signed-off-by: Sangkug Lym * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Sangkug Lym Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../language_modeling/megatron_base_model.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index b96a5adb0..f3ae5a938 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -157,6 +157,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): # The automatic garbage collector sould be disabled before training starts. if self.gc_interval > 0: gc.disable() + self.validation_global_step = 1 def _enable_nvidia_optimizations(self): "These optimizations are present in NVIDIA NGC PyTorch Containers" @@ -218,6 +219,16 @@ def on_train_start(self) -> None: super().on_train_start() self.init_global_step = self.trainer.global_step + def on_validation_start(self) -> None: + super().on_validation_start() + if self.gc_interval > 0: + gc.collect() + + def on_validation_end(self) -> None: + super().on_validation_end() + if self.gc_interval > 0: + gc.collect() + def _build_vocab(self): """ Manipulate vocabulary (e.g., pad vocabulary for increased performance)/ @@ -366,6 +377,14 @@ def on_train_batch_end(self, outputs, dataloader_iter: Any, batch_idx: int, unus if self.gc_interval > 0 and (self.trainer.global_step % self.gc_interval == 0): gc.collect() + def on_validation_batch_end(self, outputs, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + super().on_validation_batch_end(outputs, batch, batch_idx, dataloader_idx) + + if self.gc_interval > 0: + if self.validation_global_step % self.gc_interval == 0: + gc.collect() + self.validation_global_step += 1 + def setup_optimization( self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, ): From 2331b063d6f0282fa79c32e4e780db6cc7ef19f2 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Thu, 15 Jun 2023 15:34:30 -0600 Subject: [PATCH 27/35] update mcore version (#6875) Signed-off-by: ericharper --- README.rst | 2 +- requirements/requirements_nlp.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 841509dfe..d77b7b1fa 100644 --- a/README.rst +++ b/README.rst @@ -263,7 +263,7 @@ packaging is also needed: .. 
code-block:: bash - pip install -y packaging + pip install packaging Transformer Engine diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index d88280b36..0d4a5a97e 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,7 @@ inflect jieba markdown2 matplotlib>=3.3.2 -megatron_core==0.1.0 +megatron_core==0.2.0 nltk>=3.6.5 numpy opencc From 9b1774e4d320532b71cca8f46edb37a179488f76 Mon Sep 17 00:00:00 2001 From: devmehendale Date: Mon, 15 Jan 2024 15:37:04 +0530 Subject: [PATCH 28/35] fix checkpoint loading error --- nemo/core/classes/modelPT.py | 2 +- nemo/utils/exp_manager.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 01cf1611f..9e2878e54 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -358,7 +358,7 @@ def save_to(self, save_path: str): def maybe_make_save_dir(path: 'pathlib.Path'): if not path.parent.exists(): - path.parent.mkdir(parents=True) + path.parent.mkdir(parents=True, exist_ok=True) save_path = Path(save_path).expanduser().resolve() app_state = AppState() diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index af3b25eb7..af9610da3 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -21,6 +21,7 @@ from dataclasses import dataclass from datetime import timedelta from pathlib import Path +import shutil from shutil import copy, move from typing import Any, Dict, List, Optional, Tuple, Union @@ -616,9 +617,12 @@ def check_resume( if fold.is_dir(): run_count += 1 new_run_dir = Path(Path(log_dir) / f"run_{run_count}") - new_run_dir.mkdir() + new_run_dir.mkdir(exist_ok=True) for _file in files_to_move: - move(str(_file), str(new_run_dir)) + try: + move(str(_file), str(new_run_dir)) + except (FileNotFoundError, shutil.Error) as e: + logging.warning(e) def check_explicit_log_dir( From 3bddb037bef0142e0c8a3b1c19f0f2ce36cfa270 Mon Sep 17 00:00:00 2001 From: kaushal-py Date: Mon, 15 Jan 2024 15:42:43 +0530 Subject: [PATCH 29/35] Add multi-softmax architecture for CTC, RNN-T and Hybrid models --- nemo/collections/asr/data/audio_to_text.py | 44 +- .../asr/data/audio_to_text_dataset.py | 5 +- .../asr/losses/ssl_losses/contrastive.py | 11 +- nemo/collections/asr/metrics/rnnt_wer.py | 15 +- nemo/collections/asr/metrics/rnnt_wer_bpe.py | 24 +- nemo/collections/asr/metrics/wer.py | 19 +- nemo/collections/asr/metrics/wer_bpe.py | 30 +- nemo/collections/asr/models/ctc_bpe_models.py | 58 +- .../asr/models/ctc_bpe_multisoftmax_models.py | 916 ++++++++++++++++++ nemo/collections/asr/models/ctc_models.py | 130 ++- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 76 +- .../asr/models/hybrid_rnnt_ctc_models.py | 69 +- .../collections/asr/models/rnnt_bpe_models.py | 30 +- nemo/collections/asr/models/rnnt_models.py | 36 +- nemo/collections/asr/modules/__init__.py | 1 + nemo/collections/asr/modules/conv_asr.py | 65 +- .../collections/asr/modules/multi_conv_asr.py | 170 ++++ nemo/collections/asr/modules/rnnt.py | 160 ++- nemo/collections/asr/parts/mixins/mixins.py | 45 + .../parts/submodules/rnnt_greedy_decoding.py | 23 +- nemo/collections/common/data/dataset.py | 1 - .../common/parts/preprocessing/collections.py | 7 +- .../collections/common/tokenizers/__init__.py | 1 + .../tokenizers/multilingual_tokenizer.py | 235 +++++ .../code_switching_manifest_creation.py | 4 +- 25 files changed, 1983 insertions(+), 192 deletions(-) create mode 100644
nemo/collections/asr/models/ctc_bpe_multisoftmax_models.py create mode 100644 nemo/collections/asr/modules/multi_conv_asr.py create mode 100644 nemo/collections/common/tokenizers/multilingual_tokenizer.py diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py index 58cd3630e..ffb56478c 100644 --- a/nemo/collections/asr/data/audio_to_text.py +++ b/nemo/collections/asr/data/audio_to_text.py @@ -59,22 +59,27 @@ def _speech_collate_fn(batch, pad_id): assumes the signals are 1d torch tensors (i.e. mono audio). """ packed_batch = list(zip(*batch)) - if len(packed_batch) == 5: + if len(packed_batch) == 6: # has language ids + _, audio_lengths, _, tokens_lengths, sample_ids, language_ids = packed_batch + elif len(packed_batch) == 5: # has sample ids + language_ids = None _, audio_lengths, _, tokens_lengths, sample_ids = packed_batch elif len(packed_batch) == 4: - sample_ids = None + sample_ids, language_ids = None, None _, audio_lengths, _, tokens_lengths = packed_batch else: - raise ValueError("Expects 4 or 5 tensors in the batch!") + raise ValueError("Expects 4 or 5 or 6 tensors in the batch!") max_audio_len = 0 has_audio = audio_lengths[0] is not None if has_audio: max_audio_len = max(audio_lengths).item() max_tokens_len = max(tokens_lengths).item() - + audio_signal, tokens = [], [] for b in batch: - if len(b) == 5: + if len(b) == 6: + sig, sig_len, tokens_i, tokens_i_len, _, _ = b + elif len(b) == 5: sig, sig_len, tokens_i, tokens_i_len, _ = b else: sig, sig_len, tokens_i, tokens_i_len = b @@ -97,12 +102,14 @@ def _speech_collate_fn(batch, pad_id): audio_signal, audio_lengths = None, None tokens = torch.stack(tokens) tokens_lengths = torch.stack(tokens_lengths) - if sample_ids is None: - return audio_signal, audio_lengths, tokens, tokens_lengths - else: + if language_ids is not None: + sample_ids = torch.tensor(sample_ids, dtype=torch.int32) + return audio_signal, audio_lengths, tokens, tokens_lengths, sample_ids, list(language_ids) + elif sample_ids is not None: sample_ids = torch.tensor(sample_ids, dtype=torch.int32) return audio_signal, audio_lengths, tokens, tokens_lengths, sample_ids - + else: + return audio_signal, audio_lengths, tokens, tokens_lengths class ASRManifestProcessor: """ @@ -424,6 +431,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'transcripts': NeuralType(('B', 'T'), LabelsType()), 'transcript_length': NeuralType(tuple('B'), LengthsType()), 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + 'language_id': [NeuralType(('B'), StringType(), optional=True)], } def __init__( @@ -441,6 +449,7 @@ def __init__( eos_id: Optional[int] = None, pad_id: int = 0, return_sample_id: bool = False, + return_language_id: bool = False, channel_selector: Optional[ChannelSelectorType] = None, ): if type(manifest_filepath) == str: @@ -462,6 +471,7 @@ def __init__( self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) self.trim = trim self.return_sample_id = return_sample_id + self.return_language_id = return_language_id self.channel_selector = channel_selector def get_manifest_sample(self, sample_id): @@ -486,8 +496,10 @@ def __getitem__(self, index): t, tl = self.manifest_processor.process_text_by_sample(sample=sample) - if self.return_sample_id: - output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index + if self.return_language_id: + output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index, sample.lang + elif self.return_sample_id: + 
output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index else: output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long() @@ -530,6 +542,7 @@ class AudioToCharDataset(_AudioTextDataset): bos_id: Id of beginning of sequence symbol to append if not None eos_id: Id of end of sequence symbol to append if not None return_sample_id (bool): whether to return the sample_id as a part of each sample + return_language_id (bool): whether to return the language_id as a part of each sample channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. """ @@ -543,6 +556,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'transcripts': NeuralType(('B', 'T'), LabelsType()), 'transcript_length': NeuralType(tuple('B'), LengthsType()), 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + 'language_id': [NeuralType(('B'), StringType(), optional=True)], } def __init__( @@ -564,6 +578,7 @@ def __init__( pad_id: int = 0, parser: Union[str, Callable] = 'en', return_sample_id: bool = False, + return_language_id: bool = False, channel_selector: Optional[ChannelSelectorType] = None, ): self.labels = labels @@ -586,6 +601,7 @@ def __init__( eos_id=eos_id, pad_id=pad_id, return_sample_id=return_sample_id, + return_language_id=return_language_id, channel_selector=channel_selector, ) @@ -624,6 +640,7 @@ class AudioToBPEDataset(_AudioTextDataset): use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS] tokens to beginning and ending of speech respectively. return_sample_id (bool): whether to return the sample_id as a part of each sample + return_language_id (bool): whether to return the language_id as a part of each sample channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. 
""" @@ -637,6 +654,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'transcripts': NeuralType(('B', 'T'), LabelsType()), 'transcript_length': NeuralType(tuple('B'), LengthsType()), 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + 'language_id': [NeuralType(('B'), StringType(), optional=True)], } def __init__( @@ -652,6 +670,7 @@ def __init__( trim: bool = False, use_start_end_token: bool = True, return_sample_id: bool = False, + return_language_id: bool = False, channel_selector: Optional[ChannelSelectorType] = None, ): if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0: @@ -671,7 +690,7 @@ def __init__( class TokenizerWrapper: def __init__(self, tokenizer): - if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer): + if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer) or isinstance(tokenizer, tokenizers.multilingual_tokenizer.MultilingualTokenizer): self.is_aggregate = True else: self.is_aggregate = False @@ -701,6 +720,7 @@ def __call__(self, *args): pad_id=pad_id, trim=trim, return_sample_id=return_sample_id, + return_language_id=return_language_id, channel_selector=channel_selector, ) diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 14e8dea19..5aabdfc7a 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -151,6 +151,7 @@ def get_char_dataset(config: dict, augmentor: Optional['AudioAugmentor'] = None) trim=config.get('trim_silence', False), parser=config.get('parser', 'en'), return_sample_id=config.get('return_sample_id', False), + return_language_id=config.get('return_language_id', False), channel_selector=config.get('channel_selector', None), ) return dataset @@ -231,6 +232,7 @@ def get_bpe_dataset( trim=config.get('trim_silence', False), use_start_end_token=config.get('use_start_end_token', True), return_sample_id=config.get('return_sample_id', False), + return_language_id=config.get('return_language_id', False), channel_selector=config.get('channel_selector', None), ) return dataset @@ -630,7 +632,8 @@ def get_audio_to_text_bpe_dataset_from_config( logging.warning(f"`concat_sampling_probabilities` need to sum to 1. 
Config: {config}") return None - shuffle = config['shuffle'] + if config.get('shuffle', False): + shuffle = False device = 'gpu' if torch.cuda.is_available() else 'cpu' if config.get('use_dali', False): device_id = local_rank if device == 'gpu' else None diff --git a/nemo/collections/asr/losses/ssl_losses/contrastive.py b/nemo/collections/asr/losses/ssl_losses/contrastive.py index bab691913..4ca873a17 100644 --- a/nemo/collections/asr/losses/ssl_losses/contrastive.py +++ b/nemo/collections/asr/losses/ssl_losses/contrastive.py @@ -201,8 +201,17 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=Non targets.transpose(0, 1), targets_masked_only.size(0), # TxBxC # T' ) else: - # only sample from masked steps in utterance negatives, _ = self.sample_negatives(targets_masked_only, targets_masked_only.size(0)) # T'xBxC # T' + # if targets_masked_only.size(0) >= self.num_negatives: + # # only sample from masked steps in utterance + # negatives, _ = self.sample_negatives(targets_masked_only, targets_masked_only.size(0)) # T'xBxC # T' + # else: # for shorter samples (<8s) + # # print(f"sampling from non-masked ({self.num_negatives},{targets_masked_only.size(0)})") + # # sample from all steps in utterance + # negatives, _ = self.sample_negatives( + # targets.transpose(0, 1), targets_masked_only.size(0), # TxBxC # T' + # ) + # NxT'xBxC out_masked_only = out_masked_only.reshape(-1, out_masked_only.shape[-1]) diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 1ccc2d0ac..a933124ab 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ b/nemo/collections/asr/metrics/rnnt_wer.py @@ -384,6 +384,7 @@ def rnnt_decoder_predictions_tensor( encoded_lengths: torch.Tensor, return_hypotheses: bool = False, partial_hypotheses: Optional[List[Hypothesis]] = None, + lang_ids: List[str] = None, ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: """ Decode an encoder output by autoregressive decoding of the Decoder+Joint networks. @@ -408,9 +409,10 @@ def rnnt_decoder_predictions_tensor( Look at rnnt_utils.NBestHypotheses for more information. 
""" # Compute hypotheses + # print("Decode strategy:", self.cfg.strategy) with torch.inference_mode(): hypotheses_list = self.decoding( - encoder_output=encoder_output, encoded_lengths=encoded_lengths, partial_hypotheses=partial_hypotheses + encoder_output=encoder_output, encoded_lengths=encoded_lengths, partial_hypotheses=partial_hypotheses, language_ids=lang_ids, ) # type: [List[Hypothesis]] # extract the hypotheses @@ -424,7 +426,7 @@ def rnnt_decoder_predictions_tensor( for nbest_hyp in prediction_list: # type: NBestHypotheses n_hyps = nbest_hyp.n_best_hypotheses # Extract all hypotheses for this sample - decoded_hyps = self.decode_hypothesis(n_hyps) # type: List[str] + decoded_hyps = self.decode_hypothesis(n_hyps, lang_ids) # type: List[str] # If computing timestamps if self.compute_timestamps is True: @@ -443,7 +445,7 @@ def rnnt_decoder_predictions_tensor( return best_hyp_text, all_hyp_text else: - hypotheses = self.decode_hypothesis(prediction_list) # type: List[str] + hypotheses = self.decode_hypothesis(prediction_list, lang_ids) # type: List[str] # If computing timestamps if self.compute_timestamps is True: @@ -462,7 +464,7 @@ def rnnt_decoder_predictions_tensor( best_hyp_text = [h.text for h in hypotheses] return best_hyp_text, None - def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hypothesis, NBestHypotheses]]: + def decode_hypothesis(self, hypotheses_list: List[Hypothesis], lang_ids: List[str] = None) -> List[Union[Hypothesis, NBestHypotheses]]: """ Decode a list of hypotheses into a list of strings. @@ -498,7 +500,10 @@ def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hyp token_repetitions = [1] * len(alignments) # preserve number of repetitions per token hypothesis = (prediction, alignments, token_repetitions) else: - hypothesis = self.decode_tokens_to_str(prediction) + if lang_ids is not None: + hypothesis = self.decode_tokens_to_str(prediction, lang_ids[ind]) + else: + hypothesis = self.decode_tokens_to_str(prediction) # TODO: remove # collapse leading spaces before . , ? for PC models diff --git a/nemo/collections/asr/metrics/rnnt_wer_bpe.py b/nemo/collections/asr/metrics/rnnt_wer_bpe.py index 99c71daeb..5606d083d 100644 --- a/nemo/collections/asr/metrics/rnnt_wer_bpe.py +++ b/nemo/collections/asr/metrics/rnnt_wer_bpe.py @@ -195,8 +195,9 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): tokenizer: The tokenizer which will be used for decoding. """ - def __init__(self, decoding_cfg, decoder, joint, tokenizer: TokenizerSpec): - blank_id = tokenizer.tokenizer.vocab_size + def __init__(self, decoding_cfg, decoder, joint, tokenizer: TokenizerSpec, blank_id=None): + if blank_id is None: + blank_id = tokenizer.tokenizer.vocab_size self.tokenizer = tokenizer super(RNNTBPEDecoding, self).__init__( @@ -222,7 +223,7 @@ def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: hypothesis.words, hypothesis.token_confidence, hypothesis.y_sequence ) - def decode_tokens_to_str(self, tokens: List[int]) -> str: + def decode_tokens_to_str(self, tokens: List[int], lang: str = None) -> str: """ Implemented by subclass in order to decoder a token list into a string. @@ -232,7 +233,10 @@ def decode_tokens_to_str(self, tokens: List[int]) -> str: Returns: A decoded string. 
""" - hypothesis = self.tokenizer.ids_to_text(tokens) + if lang is not None: + hypothesis = self.tokenizer.ids_to_text(tokens, lang) + else: + hypothesis = self.tokenizer.ids_to_text(tokens) return hypothesis def decode_ids_to_tokens(self, tokens: List[int]) -> List[str]: @@ -275,7 +279,7 @@ def decode_ids_to_langs(self, tokens: List[int]) -> List[str]: lang_list = self.tokenizer.ids_to_text_and_langs(tokens) return lang_list - def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hypothesis, NBestHypotheses]]: + def decode_hypothesis(self, hypotheses_list: List[Hypothesis], lang_ids: List[str] = None) -> List[Union[Hypothesis, NBestHypotheses]]: """ Decode a list of hypotheses into a list of strings. Overrides the super() method optionally adding lang information @@ -286,7 +290,7 @@ def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hyp Returns: A list of strings. """ - hypotheses = super().decode_hypothesis(hypotheses_list) + hypotheses = super().decode_hypothesis(hypotheses_list, lang_ids) if self.compute_langs: if isinstance(self.tokenizer, AggregateTokenizer): for ind in range(len(hypotheses_list)): @@ -371,6 +375,7 @@ def update( encoded_lengths: torch.Tensor, targets: torch.Tensor, target_lengths: torch.Tensor, + lang_ids: List[str] = None, ) -> torch.Tensor: words = 0 scores = 0 @@ -385,10 +390,13 @@ def update( for ind in range(targets_cpu_tensor.shape[0]): tgt_len = tgt_lenths_cpu_tensor[ind].item() target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() - reference = self.decoding.decode_tokens_to_str(target) + if lang_ids is not None: + reference = self.decoding.decode_tokens_to_str(target, lang_ids[ind]) + else: + reference = self.decoding.decode_tokens_to_str(target) references.append(reference) - hypotheses, _ = self.decoding.rnnt_decoder_predictions_tensor(encoder_output, encoded_lengths) + hypotheses, _ = self.decoding.rnnt_decoder_predictions_tensor(encoder_output, encoded_lengths, lang_ids=lang_ids) if self.log_prediction: logging.info(f"\n") diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 7f7f853d3..f1bc77f0a 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -378,6 +378,7 @@ def ctc_decoder_predictions_tensor( decoder_lengths: torch.Tensor = None, fold_consecutive: bool = True, return_hypotheses: bool = False, + lang_ids: List[str] = None, ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: """ Decodes a sequence of labels to words @@ -432,7 +433,7 @@ def ctc_decoder_predictions_tensor( for nbest_hyp in hypotheses_list: # type: NBestHypotheses n_hyps = nbest_hyp.n_best_hypotheses # Extract all hypotheses for this sample decoded_hyps = self.decode_hypothesis( - n_hyps, fold_consecutive + n_hyps, fold_consecutive, lang_ids ) # type: List[Union[Hypothesis, NBestHypotheses]] # If computing timestamps @@ -453,7 +454,7 @@ def ctc_decoder_predictions_tensor( else: hypotheses = self.decode_hypothesis( - hypotheses_list, fold_consecutive + hypotheses_list, fold_consecutive, lang_ids ) # type: List[Union[Hypothesis, NBestHypotheses]] # If computing timestamps @@ -476,7 +477,7 @@ def ctc_decoder_predictions_tensor( return best_hyp_text, None def decode_hypothesis( - self, hypotheses_list: List[Hypothesis], fold_consecutive: bool + self, hypotheses_list: List[Hypothesis], fold_consecutive: bool, lang_ids: List[str] = None, ) -> List[Union[Hypothesis, NBestHypotheses]]: """ Decode a list of 
hypotheses into a list of strings. @@ -541,8 +542,11 @@ def decode_hypothesis( # in order to compute exact time stamps. hypothesis = (decoded_prediction, token_lengths, token_repetitions) else: - hypothesis = self.decode_tokens_to_str(decoded_prediction) - + if lang_ids is not None: + hypothesis = self.decode_tokens_to_str(decoded_prediction, lang_ids[ind]) + else: + hypothesis = self.decode_tokens_to_str(decoded_prediction) + # TODO: remove # collapse leading spaces before . , ? for PC models hypothesis = re.sub(r'(\s+)([\.\,\?])', r'\2', hypothesis) @@ -1026,9 +1030,10 @@ class CTCDecoding(AbstractCTCDecoding): """ def __init__( - self, decoding_cfg, vocabulary, + self, decoding_cfg, vocabulary, blank_id = None ): - blank_id = len(vocabulary) + if blank_id is None: + blank_id = len(vocabulary) self.vocabulary = vocabulary self.labels_map = dict([(i, vocabulary[i]) for i in range(len(vocabulary))]) diff --git a/nemo/collections/asr/metrics/wer_bpe.py b/nemo/collections/asr/metrics/wer_bpe.py index 762acf172..3e3ee1923 100644 --- a/nemo/collections/asr/metrics/wer_bpe.py +++ b/nemo/collections/asr/metrics/wer_bpe.py @@ -138,8 +138,10 @@ class CTCBPEDecoding(AbstractCTCDecoding): tokenizer: NeMo tokenizer object, which inherits from TokenizerSpec. """ - def __init__(self, decoding_cfg, tokenizer: TokenizerSpec): - blank_id = tokenizer.tokenizer.vocab_size + def __init__(self, decoding_cfg, tokenizer: TokenizerSpec, blank_id = None): + + if blank_id is None: + blank_id = tokenizer.tokenizer.vocab_size self.tokenizer = tokenizer super().__init__(decoding_cfg=decoding_cfg, blank_id=blank_id) @@ -175,7 +177,7 @@ def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: self.decode_tokens_to_str(hypothesis.text[0]).split(), hypothesis.token_confidence, hypothesis.text[0] ) - def decode_tokens_to_str(self, tokens: List[int]) -> str: + def decode_tokens_to_str(self, tokens: List[int], lang: str = None) -> str: """ Implemented by subclass in order to decoder a token list into a string. @@ -185,7 +187,10 @@ def decode_tokens_to_str(self, tokens: List[int]) -> str: Returns: A decoded string. 
""" - hypothesis = self.tokenizer.ids_to_text(tokens) + if lang is not None: + hypothesis = self.tokenizer.ids_to_text(tokens, lang) + else: + hypothesis = self.tokenizer.ids_to_text(tokens) return hypothesis def decode_ids_to_tokens(self, tokens: List[int]) -> List[str]: @@ -263,6 +268,7 @@ def update( predictions: torch.Tensor, targets: torch.Tensor, target_lengths: torch.Tensor, + lang_ids: List[str] = None, predictions_lengths: torch.Tensor = None, ): """ @@ -286,12 +292,20 @@ def update( for ind in range(targets_cpu_tensor.shape[0]): tgt_len = tgt_lenths_cpu_tensor[ind].item() target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() - reference = self.decoding.decode_tokens_to_str(target) + if lang_ids is not None: + reference = self.decoding.decode_tokens_to_str(target, lang_ids[ind]) + else: + reference = self.decoding.decode_tokens_to_str(target) references.append(reference) - hypotheses, _ = self.decoding.ctc_decoder_predictions_tensor( - predictions, predictions_lengths, fold_consecutive=self.fold_consecutive - ) + if lang_ids is not None: + hypotheses, _ = self.decoding.ctc_decoder_predictions_tensor( + predictions, predictions_lengths, fold_consecutive=self.fold_consecutive, lang_ids=lang_ids + ) + else: + hypotheses, _ = self.decoding.ctc_decoder_predictions_tensor( + predictions, predictions_lengths, fold_consecutive=self.fold_consecutive + ) if self.log_prediction: logging.info(f"\n") diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index a74c7f3de..14614a1b7 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -51,11 +51,11 @@ def __init__(self, cfg: DictConfig, trainer=None): # Set the new vocabulary with open_dict(cfg): # sidestepping the potential overlapping tokens issue in aggregate tokenizers - if self.tokenizer_type == "agg": + if self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual": cfg.decoder.vocabulary = ListConfig(vocabulary) else: cfg.decoder.vocabulary = ListConfig(list(vocabulary.keys())) - + # Override number of classes if placeholder provided num_classes = cfg.decoder["num_classes"] @@ -68,18 +68,33 @@ def __init__(self, cfg: DictConfig, trainer=None): cfg.decoder["num_classes"] = len(vocabulary) super().__init__(cfg=cfg, trainer=trainer) - + + # Multisoftmax + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in cfg.decoder: + logging.info("Creating masks for multi-softmax layer.") + self.language_masks = {} + for language in self.tokenizer.tokenizers_dict.keys(): + self.language_masks[language] = [(token_language == language) for _, token_language in self.tokenizer.langs_by_token_id.items()] + self.language_masks[language].append(True) # Insert blank token + self.loss = CTCLoss( + num_classes=self.decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()), + zero_infinity=True, + reduction=self._cfg.get("ctc_reduction", "mean_batch"), + ) + self.decoder.language_masks = self.language_masks + # Setup decoding objects decoding_cfg = self.cfg.get('decoding', None) - # In case decoding config not found, use default config if decoding_cfg is None: decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) with open_dict(self.cfg): self.cfg.decoding = decoding_cfg - - self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) - + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in cfg.decoder: + self.decoding = 
CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + else: + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) + # Setup metric with decoding strategy self._wer = WERBPE( decoding=self.decoding, @@ -105,7 +120,6 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # DALI Dataset implements dataloader interface return dataset - shuffle = config['shuffle'] if config.get('is_tarred', False): shuffle = False @@ -118,15 +132,25 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) + if config.get('shuffle', False): + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=config['shuffle'], + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + else: + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ diff --git a/nemo/collections/asr/models/ctc_bpe_multisoftmax_models.py b/nemo/collections/asr/models/ctc_bpe_multisoftmax_models.py new file mode 100644 index 000000000..6cd608c42 --- /dev/null +++ b/nemo/collections/asr/models/ctc_bpe_multisoftmax_models.py @@ -0,0 +1,916 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import os +from typing import Dict, List, Optional, Union + +import torch +from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict + +from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset +from nemo.collections.asr.losses.ctc import CTCLoss +from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType +from nemo.collections.asr.models.ctc_models import EncDecCTCModel +from nemo.collections.asr.parts.mixins import ASRBPEMixin +from nemo.core.classes.common import PretrainedModelInfo, typecheck +from nemo.utils import logging, model_utils +from nemo.core.classes.mixins import AccessMixin +from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType, StringType + +__all__ = ['EncDecCTCModelBPE'] + + +class EncDecCTCMultiSoftmaxModelBPE(EncDecCTCModel, ASRBPEMixin): + """Encoder decoder CTC-based models with Byte Pair Encoding.""" + + def __init__(self, cfg: DictConfig, trainer=None): + # Convert to Hydra 1.0 compatible DictConfig + cfg = model_utils.convert_model_config_to_dict_config(cfg) + cfg = model_utils.maybe_update_config_version(cfg) + + if 'tokenizer' not in cfg: + raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") + + # Setup the tokenizer + self._setup_tokenizer(cfg.tokenizer) + + # Initialize a dummy vocabulary + vocabulary = self.tokenizer.tokenizer.get_vocab() + + # Set the new vocabulary + with open_dict(cfg): + # sidestepping the potential overlapping tokens issue in aggregate tokenizers + if self.tokenizer_type == "agg": + cfg.decoder.vocabulary = ListConfig(vocabulary) + else: + cfg.decoder.vocabulary = ListConfig(list(vocabulary.keys())) + + # Override number of classes if placeholder provided + num_classes = cfg.decoder["num_classes"] + + if num_classes < 1: + logging.info( + "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format( + num_classes, len(vocabulary) + ) + ) + cfg.decoder["num_classes"] = len(vocabulary) + + super().__init__(cfg=cfg, trainer=trainer) + + self.loss = CTCLoss( + num_classes=self.decoder._num_classes, + zero_infinity=True, + reduction=self._cfg.get("ctc_reduction", "mean_batch"), + ) + + # # Multisoftmax + if self.tokenizer_type == "agg" and "multisoftmax" in cfg.decoder: + logging.info("Creating masks for multi-softmax layer.") + self.language_masks = {} + for language in self.tokenizer.tokenizers_dict.keys(): + self.language_masks[language] = [(token_language == language) for _, token_language in self.tokenizer.langs_by_token_id.items()] + self.language_masks[language].append(True) # Insert blank token + self.decoder.language_masks = self.language_masks + + # Setup decoding objects + decoding_cfg = self.cfg.get('decoding', None) + + # In case decoding config not found, use default config + if decoding_cfg is None: + decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) + with open_dict(self.cfg): + self.cfg.decoding = decoding_cfg + + + self.decoding = {} + for language in self.decoder.languages: + self.decoding[language] = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer.tokenizers_dict[language]) + + self._wer_dict = {} + for language in self.decoder.languages: + self._wer_dict[language] = WERBPE( + decoding=self.decoding[language], + use_cer=self._cfg.get('use_cer', False), + 
dist_sync_on_step=True, + log_prediction=self._cfg.get("log_prediction", False), + ) + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( + config=config, + local_rank=self.local_rank, + global_rank=self.global_rank, + world_size=self.world_size, + tokenizer=self.tokenizer, + preprocessor_cfg=self.cfg.get("preprocessor", None), + ) + + if dataset is None: + return None + + if isinstance(dataset, AudioToBPEDALIDataset): + # DALI Dataset implements dataloader interface + return dataset + + if config.get('is_tarred', False): + shuffle = False + + if hasattr(dataset, 'collate_fn'): + collate_fn = dataset.collate_fn + elif hasattr(dataset.datasets[0], 'collate_fn'): + # support datasets that are lists of entries + collate_fn = dataset.datasets[0].collate_fn + else: + # support datasets that are lists of lists + collate_fn = dataset.datasets[0].datasets[0].collate_fn + + if config.get('shuffle', False): + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=shuffle, + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + else: + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + + def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': + """ + Setup function for a temporary data loader which wraps the provided audio file. + + Args: + config: A python dictionary which contains the following keys: + paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ + Recommended length per file is between 5 and 25 seconds. + batch_size: (int) batch size to use during inference. \ + Bigger will result in better throughput performance but would use more memory. + temp_dir: (str) A temporary directory where the audio manifest is temporarily + stored. + num_workers: (int) number of workers. Depends of the batch_size and machine. \ + 0 - only the main process will load batches, 1 - one worker (not main process) + + Returns: + A pytorch DataLoader for the given audio file(s). 
+ """ + + if 'manifest_filepath' in config: + manifest_filepath = config['manifest_filepath'] + batch_size = config['batch_size'] + else: + manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') + batch_size = min(config['batch_size'], len(config['paths2audio_files'])) + + dl_config = { + 'manifest_filepath': manifest_filepath, + 'sample_rate': self.preprocessor._sample_rate, + 'batch_size': batch_size, + 'shuffle': False, + 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), + 'pin_memory': True, + 'channel_selector': config.get('channel_selector', None), + 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), + } + + if config.get("augmentor"): + dl_config['augmentor'] = config.get("augmentor") + + temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) + return temporary_datalayer + + # PTL-specific methods + def training_step(self, batch, batch_nb): + # Reset access registry + if AccessMixin.is_access_enabled(): + AccessMixin.reset_registry(self) + + if self.is_interctc_enabled(): + AccessMixin.set_access_enabled(access_enabled=True) + + if "multisoftmax" not in self.cfg.decoder: + signal, signal_len, transcript, transcript_len = batch + language = None + else: + signal, signal_len, transcript, transcript_len, sample_ids, language_ids = batch + assert all(i == language_ids[0] for i in language_ids), f"Language ids are different for a batch -> {language_ids}" + language = language_ids[0] + + if isinstance(batch, DALIOutputs) and batch.has_processed_signal: + log_probs, encoded_len, predictions = self.forward( + processed_signal=signal, processed_signal_length=signal_len + ) + else: + if "multisoftmax" in self.cfg.decoder: + log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len, language_ids=language_ids) + else: + log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) + + if hasattr(self, '_trainer') and self._trainer is not None: + log_every_n_steps = self._trainer.log_every_n_steps + else: + log_every_n_steps = 1 + + loss_value = self.loss( + log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len + ) + + # Add auxiliary losses, if registered + loss_value = self.add_auxiliary_losses(loss_value) + # only computing WER when requested in the logs (same as done for final-layer WER below) + loss_value, tensorboard_logs = self.add_interctc_losses( + loss_value, transcript, transcript_len, compute_wer=((batch_nb + 1) % log_every_n_steps == 0) + ) + + # Reset access registry + if AccessMixin.is_access_enabled(): + AccessMixin.reset_registry(self) + + tensorboard_logs.update( + { + 'train_loss': loss_value, + 'learning_rate': self._optimizer.param_groups[0]['lr'], + 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), + } + ) + + if (batch_nb + 1) % log_every_n_steps == 0: + self._wer_dict[language].update( + predictions=log_probs, + targets=transcript, + target_lengths=transcript_len, + predictions_lengths=encoded_len, + ) + wer, _, _ = self._wer_dict[language].compute() + self._wer_dict[language].reset() + tensorboard_logs.update({'training_batch_wer': wer}) + + return {'loss': loss_value, 'log': tensorboard_logs} + + def predict_step(self, batch, batch_idx, dataloader_idx=0): + if "multisoftmax" not in self.cfg.decoder: + signal, signal_len, transcript, transcript_len = batch + language = None + else: + signal, signal_len, 
transcript, transcript_len, sample_ids, language_ids = batch
+            assert all(i == language_ids[0] for i in language_ids), f"Language ids are different for a batch -> {language_ids}"
+            language = language_ids[0]
+
+        if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
+            log_probs, encoded_len, predictions = self.forward(
+                processed_signal=signal, processed_signal_length=signal_len
+            )
+        else:
+            if "multisoftmax" in self.cfg.decoder:
+                log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len, language_ids=language_ids)
+            else:
+                log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len)
+
+        transcribed_texts, _ = self._wer_dict[language].decoding.ctc_decoder_predictions_tensor(
+            decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False,
+        )
+
+        # `sample_ids` is only unpacked in the multi-softmax batch layout above
+        sample_ids = sample_ids.cpu().detach().numpy()
+        return list(zip(sample_ids, transcribed_texts))
+
+    def validation_step(self, batch, batch_idx, dataloader_idx=0):
+        if self.is_interctc_enabled():
+            AccessMixin.set_access_enabled(access_enabled=True)
+
+        if "multisoftmax" not in self.cfg.decoder:
+            signal, signal_len, transcript, transcript_len = batch
+            language = None
+        else:
+            signal, signal_len, transcript, transcript_len, sample_ids, language_ids = batch
+            assert all(i == language_ids[0] for i in language_ids), f"Language ids are different for a batch -> {language_ids}"
+            language = language_ids[0]
+
+        if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
+            log_probs, encoded_len, predictions = self.forward(
+                processed_signal=signal, processed_signal_length=signal_len
+            )
+        else:
+            if "multisoftmax" in self.cfg.decoder:
+                log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len, language_ids=language_ids)
+            else:
+                log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len)
+
+        loss_value = self.loss(
+            log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
+        )
+        loss_value, metrics = self.add_interctc_losses(
+            loss_value, transcript, transcript_len, compute_wer=True, log_wer_num_denom=True, log_prefix="val_",
+        )
+
+        self._wer_dict[language].update(
+            predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len
+        )
+        wer, wer_num, wer_denom = self._wer_dict[language].compute()
+        self._wer_dict[language].reset()
+        metrics.update({'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom, 'val_wer': wer})
+
+        self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32))
+
+        # Reset access registry
+        if AccessMixin.is_access_enabled():
+            AccessMixin.reset_registry(self)
+
+        return metrics
+
+    @torch.no_grad()
+    def transcribe(
+        self,
+        paths2audio_files: List[str],
+        language: str,
+        batch_size: int = 4,
+        logprobs: bool = False,
+        return_hypotheses: bool = False,
+        num_workers: int = 0,
+        channel_selector: Optional[ChannelSelectorType] = None,
+        augmentor: DictConfig = None,
+        verbose: bool = True,
+    ) -> List[str]:
+        """
+        If you modify this function, please remember to update transcribe_partial_audio() in
+        nemo/collections/asr/parts/utils/transcribe_utils.py
+
+        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.
+
+        Args:
+            paths2audio_files: (a list) of paths to audio files. \
+                Recommended length per file is between 5 and 25 seconds.
\ + But it is possible to pass a few hours long file if enough GPU memory is available. + batch_size: (int) batch size to use during inference. + Bigger will result in better throughput performance but would use more memory. + logprobs: (bool) pass True to get log probabilities instead of transcripts. + return_hypotheses: (bool) Either return hypotheses or text + With hypotheses can do some postprocessing like getting timestamp or rescoring + num_workers: (int) number of workers for DataLoader + channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. + augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. + verbose: (bool) whether to display tqdm progress bar + Returns: + A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files + """ + if paths2audio_files is None or len(paths2audio_files) == 0: + return {} + + if return_hypotheses and logprobs: + raise ValueError( + "Either `return_hypotheses` or `logprobs` can be True at any given time." + "Returned hypotheses will contain the logprobs." + ) + + if num_workers is None: + num_workers = min(batch_size, os.cpu_count() - 1) + + # We will store transcriptions here + hypotheses = [] + all_hypotheses = [] + + # Model's mode and device + mode = self.training + device = next(self.parameters()).device + dither_value = self.preprocessor.featurizer.dither + pad_to_value = self.preprocessor.featurizer.pad_to + + try: + self.preprocessor.featurizer.dither = 0.0 + self.preprocessor.featurizer.pad_to = 0 + # Switch model to evaluation mode + self.eval() + # Freeze the encoder and decoder modules + self.encoder.freeze() + self.decoder.freeze() + logging_level = logging.get_verbosity() + logging.set_verbosity(logging.WARNING) + # Work in tmp directory - will store manifest file there + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: + for audio_file in paths2audio_files: + entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''} + fp.write(json.dumps(entry) + '\n') + + config = { + 'paths2audio_files': paths2audio_files, + 'batch_size': batch_size, + 'temp_dir': tmpdir, + 'num_workers': num_workers, + 'channel_selector': channel_selector, + } + + if augmentor: + config['augmentor'] = augmentor + + temporary_datalayer = self._setup_transcribe_dataloader(config) + for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose): + logits, logits_len, greedy_predictions = self.forward( + input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + ) + + if logprobs: + # dump log probs per file + for idx in range(logits.shape[0]): + lg = logits[idx][: logits_len[idx]] + hypotheses.append(lg.cpu().numpy()) + else: + current_hypotheses, all_hyp = self.decoding[language].ctc_decoder_predictions_tensor( + logits, decoder_lengths=logits_len, return_hypotheses=return_hypotheses, + ) + logits = logits.cpu() + + if return_hypotheses: + # dump log probs per file + for idx in range(logits.shape[0]): + current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] + if current_hypotheses[idx].alignments is None: + current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence + + if all_hyp is None: + hypotheses += current_hypotheses + else: + hypotheses += 
all_hyp
+
+                    del greedy_predictions
+                    del logits
+                    del test_batch
+        finally:
+            # set mode back to its original value
+            self.train(mode=mode)
+            self.preprocessor.featurizer.dither = dither_value
+            self.preprocessor.featurizer.pad_to = pad_to_value
+            if mode is True:
+                self.encoder.unfreeze()
+                self.decoder.unfreeze()
+            logging.set_verbosity(logging_level)
+
+        return hypotheses
+
+    def change_vocabulary(
+        self,
+        new_tokenizer_dir: Union[str, DictConfig],
+        new_tokenizer_type: str,
+        decoding_cfg: Optional[DictConfig] = None,
+    ):
+        """
+        Changes the vocabulary of the tokenizer used during CTC decoding.
+        Use this method when fine-tuning from a pre-trained model.
+        It changes only the decoder and leaves the encoder and pre-processing modules unchanged. For example, you would
+        use it if you want to reuse a pretrained encoder when fine-tuning on data in another language, or when you need
+        the model to learn capitalization, punctuation and/or special characters.
+
+        Args:
+            new_tokenizer_dir: Directory path to the tokenizer, or a tokenizer config (only if the tokenizer type is `agg`)
+            new_tokenizer_type: Either `agg`, `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers,
+                whereas `wpe` is used for `BertTokenizer`.
+            decoding_cfg: Optional config for CTC decoding. If not provided, the model's current decoding config is reused.
+
+        Returns: None
+
+        """
+        if isinstance(new_tokenizer_dir, DictConfig):
+            if new_tokenizer_type == 'agg':
+                new_tokenizer_cfg = new_tokenizer_dir
+            else:
+                raise ValueError(
+                    f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}'
+                )
+        else:
+            new_tokenizer_cfg = None
+
+        if new_tokenizer_cfg is not None:
+            tokenizer_cfg = new_tokenizer_cfg
+        else:
+            if not os.path.isdir(new_tokenizer_dir):
+                raise NotADirectoryError(
+                    f"New tokenizer dir must be non-empty path to a directory. 
But I got: {new_tokenizer_dir}" + ) + + if new_tokenizer_type.lower() not in ('bpe', 'wpe'): + raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') + + tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) + + # Setup the tokenizer + self._setup_tokenizer(tokenizer_cfg) + + # Initialize a dummy vocabulary + vocabulary = self.tokenizer.tokenizer.get_vocab() + + # Set the new vocabulary + decoder_config = copy.deepcopy(self.decoder.to_config_dict()) + # sidestepping the potential overlapping tokens issue in aggregate tokenizers + if self.tokenizer_type == "agg": + decoder_config.vocabulary = ListConfig(vocabulary) + else: + decoder_config.vocabulary = ListConfig(list(vocabulary.keys())) + + decoder_num_classes = decoder_config['num_classes'] + + # Override number of classes if placeholder provided + logging.info( + "\nReplacing old number of classes ({}) with new number of classes - {}".format( + decoder_num_classes, len(vocabulary) + ) + ) + + decoder_config['num_classes'] = len(vocabulary) + + del self.decoder + self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config) + del self.loss + self.loss = CTCLoss( + num_classes=self.decoder.num_classes_with_blank - 1, + zero_infinity=True, + reduction=self._cfg.get("ctc_reduction", "mean_batch"), + ) + + if decoding_cfg is None: + # Assume same decoding config as before + decoding_cfg = self.cfg.decoding + + # Assert the decoding config with all hyper parameters + decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) + decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) + decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) + + self.decoding = {} + for language in self.decoder.languages: + self.decoding[language] = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer.tokenizers_dict[language]) + + self._wer_dict = {} + for language in self.decoder.languages: + self._wer_dict[language] = WERBPE( + decoding=self.decoding[language], + use_cer=self._cfg.get('use_cer', False), + dist_sync_on_step=True, + log_prediction=self._cfg.get("log_prediction", False), + ) + + # Update config + with open_dict(self.cfg.decoder): + self._cfg.decoder = decoder_config + + with open_dict(self.cfg.decoding): + self._cfg.decoding = decoding_cfg + + logging.info(f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.") + + def change_decoding_strategy(self, decoding_cfg: DictConfig): + """ + Changes decoding strategy used during CTC decoding process. + + Args: + decoding_cfg: A config for the decoder, which is optional. If the decoding type + needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
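+
+            As a usage sketch (the field below is only an example of a valid
+            ``CTCBPEDecodingConfig`` entry, not a recommendation)::
+
+                from omegaconf import OmegaConf
+
+                decoding_cfg = OmegaConf.create({'strategy': 'greedy'})
+                model.change_decoding_strategy(decoding_cfg)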
+ """ + if decoding_cfg is None: + # Assume same decoding config as before + logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") + decoding_cfg = self.cfg.decoding + + # Assert the decoding config with all hyper parameters + decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) + decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) + decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) + + self.decoding = {} + for language in self.decoder.languages: + self.decoding[language] = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer.tokenizers_dict[language]) + + self._wer_dict = {} + for language in self.decoder.languages: + self._wer_dict[language] = WERBPE( + decoding=self.decoding[language], + use_cer=self._cfg.get('use_cer', False), + dist_sync_on_step=True, + log_prediction=self._cfg.get("log_prediction", False), + ) + + self.decoder.temperature = decoding_cfg.get('temperature', 1.0) + + # Update config + with open_dict(self.cfg.decoding): + self.cfg.decoding = decoding_cfg + + logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") + + @classmethod + def list_available_models(cls) -> List[PretrainedModelInfo]: + """ + This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. + + Returns: + List of available pre-trained models. + """ + results = [] + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_256", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256/versions/1.0.0rc1/files/stt_en_citrinet_256.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_512", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512/versions/1.0.0rc1/files/stt_en_citrinet_512.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_1024", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024/versions/1.0.0rc1/files/stt_en_citrinet_1024.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_256_gamma_0_25", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_256_gamma_0_25.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_512_gamma_0_25", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_512_gamma_0_25.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_1024_gamma_0_25", + description="For details about this model, please visit 
https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_1024_gamma_0_25.nemo", + ) + + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_es_citrinet_512", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_citrinet_512/versions/1.0.0/files/stt_es_citrinet_512.nemo", + ) + + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_de_citrinet_1024", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_citrinet_1024", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_citrinet_1024/versions/1.5.0/files/stt_de_citrinet_1024.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_fr_citrinet_1024_gamma_0_25", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_citrinet_1024_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_citrinet_1024_gamma_0_25/versions/1.5/files/stt_fr_citrinet_1024_gamma_0_25.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_fr_no_hyphen_citrinet_1024_gamma_0_25", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_citrinet_1024_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_citrinet_1024_gamma_0_25/versions/1.5/files/stt_fr_no_hyphen_citrinet_1024_gamma_0_25.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_es_citrinet_1024_gamma_0_25", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_1024_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_citrinet_1024_gamma_0_25/versions/1.8.0/files/stt_es_citrinet_1024_gamma_0_25.nemo", + ) + + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_small", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small/versions/1.6.0/files/stt_en_conformer_ctc_small.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_medium", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium/versions/1.6.0/files/stt_en_conformer_ctc_medium.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large/versions/1.10.0/files/stt_en_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + 
pretrained_model_name="stt_en_conformer_ctc_xlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_xlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_xlarge/versions/1.10.0/files/stt_en_conformer_ctc_xlarge.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_xsmall_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_xsmall_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_xsmall_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_xsmall_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_small_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_small_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_small_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_small_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_small_medium_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_small_medium_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_small_medium_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_small_medium_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_medium_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_medium_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_medium_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_medium_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_medium_large_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_medium_large_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_medium_large_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_medium_large_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_large_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_large_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_large_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_large_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_small_ls", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small_ls/versions/1.0.0/files/stt_en_conformer_ctc_small_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_medium_ls", + 
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium_ls/versions/1.0.0/files/stt_en_conformer_ctc_medium_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_large_ls", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large_ls/versions/1.0.0/files/stt_en_conformer_ctc_large_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_fr_conformer_ctc_large", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_ctc_large/versions/1.5.1/files/stt_fr_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_fr_no_hyphen_conformer_ctc_large", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_ctc_large/versions/1.5.1/files/stt_fr_no_hyphen_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_de_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_conformer_ctc_large/versions/1.5.0/files/stt_de_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_es_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_conformer_ctc_large/versions/1.8.0/files/stt_es_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_hi_conformer_ctc_medium", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hi_conformer_ctc_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hi_conformer_ctc_medium/versions/1.6.0/files/stt_hi_conformer_ctc_medium.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_mr_conformer_ctc_medium", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_mr_conformer_ctc_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_mr_conformer_ctc_medium/versions/1.6.0/files/stt_mr_conformer_ctc_medium.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_enes_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_ctc_large/versions/1.0.0/files/stt_enes_conformer_ctc_large.nemo", + ) + results.append(model) + + 
model = PretrainedModelInfo( + pretrained_model_name="stt_ca_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_conformer_ctc_large/versions/1.11.0/files/stt_ca_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_rw_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_rw_conformer_ctc_large/versions/1.11.0/files/stt_rw_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_enes_conformer_ctc_large_codesw", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large_codesw", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_ctc_large_codesw/versions/1.0.0/files/stt_enes_conformer_ctc_large_codesw.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_be_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_be_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_be_conformer_ctc_large/versions/1.12.0/files/stt_be_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_hr_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_conformer_ctc_large/versions/1.11.0/files/stt_hr_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_it_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_conformer_ctc_large/versions/1.13.0/files/stt_it_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_ru_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_conformer_ctc_large/versions/1.13.0/files/stt_ru_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_eo_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_eo_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_eo_conformer_ctc_large/versions/1.14.0/files/stt_eo_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large", + 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large/versions/1.0.0/files/stt_en_fastconformer_ctc_large.nemo", + ) + results.append(model) + + return results diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 1446e1ce8..7d7b10b94 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -32,7 +32,7 @@ from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.classes.mixins import AccessMixin -from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType +from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType, StringType from nemo.utils import logging __all__ = ['EncDecCTCModel'] @@ -69,7 +69,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): cfg.decoder["num_classes"] = len(self.cfg.decoder.vocabulary) self.decoder = EncDecCTCModel.from_config_dict(self._cfg.decoder) - + self.loss = CTCLoss( num_classes=self.decoder.num_classes_with_blank - 1, zero_infinity=True, @@ -99,6 +99,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): dist_sync_on_step=True, log_prediction=self._cfg.get("log_prediction", False), ) + + self.language_masks = None # Only supported for CTC_BPE models # Setup optional Optimization flags self.setup_optimization_flags() @@ -281,7 +283,7 @@ def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[Di decoding_cls = OmegaConf.structured(CTCDecodingConfig) decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - + self.decoding = CTCDecoding( decoding_cfg=decoding_cfg, vocabulary=OmegaConf.to_container(self.decoder.vocabulary) ) @@ -349,6 +351,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # Automatically inject args from model config to dataloader config audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='labels') + print("CONFIG:", config.return_language_id) dataset = audio_to_text_dataset.get_audio_to_text_char_dataset_from_config( config=config, local_rank=self.local_rank, @@ -377,16 +380,26 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - + if config.get('shuffle', False): + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=config['shuffle'], + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + else: + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + def setup_training_data(self, 
train_data_config: Optional[Union[DictConfig, Dict]]): """ Sets up the training data loader via a Dict-like object. @@ -402,8 +415,8 @@ def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True + # if 'shuffle' not in train_data_config: + # train_data_config['shuffle'] = True # preserve config self._update_dataset_config(dataset_name='train', config=train_data_config) @@ -486,6 +499,7 @@ def input_types(self) -> Optional[Dict[str, NeuralType]]: "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), + 'language_ids': [NeuralType(('B'), StringType(), optional=True)], } @property @@ -498,7 +512,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: @typecheck() def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None + self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None, language_ids=None ): """ Forward pass of the model. @@ -539,7 +553,7 @@ def forward( encoder_output = self.encoder(audio_signal=processed_signal, length=processed_signal_length) encoded = encoder_output[0] encoded_len = encoder_output[1] - log_probs = self.decoder(encoder_output=encoded) + log_probs = self.decoder(encoder_output=encoded, language_ids=language_ids) greedy_predictions = log_probs.argmax(dim=-1, keepdim=False) return ( @@ -557,19 +571,26 @@ def training_step(self, batch, batch_nb): if self.is_interctc_enabled(): AccessMixin.set_access_enabled(access_enabled=True) - signal, signal_len, transcript, transcript_len = batch + if "multisoftmax" not in self.cfg.decoder: + signal, signal_len, transcript, transcript_len = batch + else: + signal, signal_len, transcript, transcript_len, sample_ids, language_ids = batch + if isinstance(batch, DALIOutputs) and batch.has_processed_signal: log_probs, encoded_len, predictions = self.forward( processed_signal=signal, processed_signal_length=signal_len ) else: - log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) + if "multisoftmax" in self.cfg.decoder: + log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len, language_ids=language_ids) + else: + log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) if hasattr(self, '_trainer') and self._trainer is not None: log_every_n_steps = self._trainer.log_every_n_steps else: log_every_n_steps = 1 - + loss_value = self.loss( log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len ) @@ -594,12 +615,21 @@ def training_step(self, batch, batch_nb): ) if (batch_nb + 1) % log_every_n_steps == 0: - self._wer.update( - predictions=log_probs, - targets=transcript, - target_lengths=transcript_len, - predictions_lengths=encoded_len, - ) + if "multisoftmax" in self.cfg.decoder: + self._wer.update( + predictions=log_probs, + targets=transcript, + target_lengths=transcript_len, + predictions_lengths=encoded_len, + lang_ids=language_ids, + ) + else: + self._wer.update( + predictions=log_probs, + targets=transcript, + 
target_lengths=transcript_len, + predictions_lengths=encoded_len, + ) wer, _, _ = self._wer.compute() self._wer.reset() tensorboard_logs.update({'training_batch_wer': wer}) @@ -607,17 +637,30 @@ def training_step(self, batch, batch_nb): return {'loss': loss_value, 'log': tensorboard_logs} def predict_step(self, batch, batch_idx, dataloader_idx=0): - signal, signal_len, transcript, transcript_len, sample_id = batch + if "multisoftmax" not in self.cfg.decoder: + signal, signal_len, transcript, transcript_len = batch + else: + signal, signal_len, transcript, transcript_len, sample_ids, language_ids = batch + if isinstance(batch, DALIOutputs) and batch.has_processed_signal: log_probs, encoded_len, predictions = self.forward( processed_signal=signal, processed_signal_length=signal_len ) + transcribed_texts, _ = self._wer.decoding.ctc_decoder_predictions_tensor( + decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, + ) else: - log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) - - transcribed_texts, _ = self._wer.decoding.ctc_decoder_predictions_tensor( - decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, - ) + if "multisoftmax" in self.cfg.decoder: + log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len, language_ids=language_ids) + transcribed_texts, _ = self._wer.decoding.ctc_decoder_predictions_tensor( + decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, lang_ids=language_ids, + ) + else: + log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) + transcribed_texts, _ = self._wer.decoding.ctc_decoder_predictions_tensor( + decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, + ) + sample_id = sample_id.cpu().detach().numpy() return list(zip(sample_id, transcribed_texts)) @@ -626,13 +669,20 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): if self.is_interctc_enabled(): AccessMixin.set_access_enabled(access_enabled=True) - signal, signal_len, transcript, transcript_len = batch + if "multisoftmax" not in self.cfg.decoder: + signal, signal_len, transcript, transcript_len = batch + else: + signal, signal_len, transcript, transcript_len, sample_ids, language_ids = batch + if isinstance(batch, DALIOutputs) and batch.has_processed_signal: log_probs, encoded_len, predictions = self.forward( processed_signal=signal, processed_signal_length=signal_len ) else: - log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) + if "multisoftmax" in self.cfg.decoder: + log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len, language_ids=language_ids) + else: + log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) loss_value = self.loss( log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len @@ -640,10 +690,14 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): loss_value, metrics = self.add_interctc_losses( loss_value, transcript, transcript_len, compute_wer=True, log_wer_num_denom=True, log_prefix="val_", ) - - self._wer.update( - predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len - ) + if "multisoftmax" in self.cfg.decoder: + self._wer.update( + predictions=log_probs, 
targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len, lang_ids=language_ids, + ) + else: + self._wer.update( + predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len, + ) wer, wer_num, wer_denom = self._wer.compute() self._wer.reset() metrics.update({'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom, 'val_wer': wer}) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index b88669a1f..feafcbc30 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -73,7 +73,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): ) with open_dict(cfg): - if self.tokenizer_type == "agg": + if self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual": cfg.aux_ctc.decoder.vocabulary = ListConfig(vocabulary) else: cfg.aux_ctc.decoder.vocabulary = ListConfig(list(vocabulary.keys())) @@ -92,6 +92,40 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.decoding = RNNTBPEDecoding( decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, ) + + # Multisoftmax + self.language_masks = None + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in cfg.decoder: + logging.info("Creating masks for multi-softmax layer.") + self.language_masks = {} + self.token_id_offsets = self.tokenizer.token_id_offset + self.offset_token_ids_by_token_id = self.tokenizer.offset_token_ids_by_token_id + for language in self.tokenizer.tokenizers_dict.keys(): + self.language_masks[language] = [(token_language == language) for _, token_language in self.tokenizer.langs_by_token_id.items()] + self.language_masks[language].append(True) # Insert blank token + self.ctc_loss = CTCLoss( + num_classes=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()), + zero_infinity=True, + reduction=self.cfg.aux_ctc.get("ctc_reduction", "mean_batch"), + ) + # Setup RNNT Loss + loss_name, loss_kwargs = self.extract_rnnt_loss_cfg(self.cfg.get("loss", None)) + self.loss = RNNTLoss( + num_classes=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()), + loss_name=loss_name, + loss_kwargs=loss_kwargs, + reduction=self.cfg.get("rnnt_reduction", "mean_batch"), + ) + # Setup decoding object + self.decoding = RNNTBPEDecoding( + decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()) + ) + + self.decoder.language_masks = self.language_masks + self.joint.language_masks = self.language_masks + self.joint.token_id_offsets = self.token_id_offsets + self.joint.offset_token_ids_by_token_id = self.offset_token_ids_by_token_id + self.ctc_decoder.language_masks = self.language_masks # Setup wer object self.wer = RNNTBPEWER( @@ -113,8 +147,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): ctc_decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) with open_dict(self.cfg.aux_ctc): self.cfg.aux_ctc.decoding = ctc_decoding_cfg - self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) - + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in cfg.decoder: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, 
blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + else: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) + # Setup CTC WER self.ctc_wer = WERBPE( decoding=self.ctc_decoding, @@ -124,7 +161,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): ) # setting the RNNT decoder as the default one - self.cur_decoder = "rnnt" + # self.cur_decoder = "rnnt" + self.cur_decoder = "ctc" def _setup_dataloader_from_config(self, config: Optional[Dict]): dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( @@ -143,7 +181,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # DALI Dataset implements dataloader interface return dataset - shuffle = config['shuffle'] + # shuffle = config['shuffle'] if config.get('is_tarred', False): shuffle = False @@ -156,15 +194,25 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) + if config.get('shuffle', False): + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=config['shuffle'], + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + else: + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index 5ca6124ec..6f6bfcb77 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -87,6 +87,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # setting the RNNT decoder as the default one self.cur_decoder = "rnnt" + # self.cur_decoder = "ctc" # setting up interCTC loss (from InterCTCMixin) self.setup_interctc(decoder_name='ctc_decoder', loss_name='ctc_loss', wer_name='ctc_wer') @@ -374,7 +375,11 @@ def training_step(self, batch, batch_nb): if self.is_interctc_enabled(): AccessMixin.set_access_enabled(access_enabled=True) - signal, signal_len, transcript, transcript_len = batch + if "multisoftmax" not in self.cfg.decoder: + signal, signal_len, transcript, transcript_len = batch + language_ids = None + else: + signal, signal_len, transcript, transcript_len, sample_ids, language_ids = batch # forward() only performs encoder forward if isinstance(batch, DALIOutputs) and batch.has_processed_signal: @@ -401,7 +406,7 @@ def training_step(self, batch, batch_nb): # If fused Joint-Loss-WER is not used if not self.joint.fuse_loss_wer: # Compute full joint and loss - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) + joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder, language_ids=language_ids) loss_value = self.loss( log_probs=joint, targets=transcript, input_lengths=encoded_len, 
target_lengths=target_length ) @@ -429,6 +434,7 @@ def training_step(self, batch, batch_nb): transcripts=transcript, transcript_lengths=transcript_len, compute_wer=compute_wer, + language_ids=language_ids ) # Add auxiliary losses, if registered @@ -443,7 +449,7 @@ def training_step(self, batch, batch_nb): tensorboard_logs.update({'training_batch_wer': wer}) if self.ctc_loss_weight > 0: - log_probs = self.ctc_decoder(encoder_output=encoded) + log_probs = self.ctc_decoder(encoder_output=encoded, language_ids=language_ids) ctc_loss = self.ctc_loss( log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len ) @@ -451,12 +457,21 @@ def training_step(self, batch, batch_nb): tensorboard_logs['train_ctc_loss'] = ctc_loss loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss if compute_wer: - self.ctc_wer.update( - predictions=log_probs, - targets=transcript, - target_lengths=transcript_len, - predictions_lengths=encoded_len, - ) + if "multisoftmax" in self.cfg.decoder: + self.ctc_wer.update( + predictions=log_probs, + targets=transcript, + target_lengths=transcript_len, + predictions_lengths=encoded_len, + lang_ids=language_ids, + ) + else: + self.ctc_wer.update( + predictions=log_probs, + targets=transcript, + target_lengths=transcript_len, + predictions_lengths=encoded_len, + ) ctc_wer, _, _ = self.ctc_wer.compute() self.ctc_wer.reset() tensorboard_logs.update({'training_batch_wer_ctc': ctc_wer}) @@ -486,19 +501,30 @@ def training_step(self, batch, batch_nb): def predict_step(self, batch, batch_idx, dataloader_idx=0): # TODO: add support for CTC decoding - signal, signal_len, transcript, transcript_len, sample_id = batch + if "multisoftmax" not in self.cfg.decoder: + signal, signal_len, transcript, transcript_len = batch + language_ids = None + else: + signal, signal_len, transcript, transcript_len, sample_ids, language_ids = batch # forward() only performs encoder forward if isinstance(batch, DALIOutputs) and batch.has_processed_signal: encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) + best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False + ) else: encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) + if "multisoftmax" in self.cfg.decoder: + best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False, lang_ids=language_ids + ) + else: + best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False + ) del signal - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( - encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False - ) - sample_id = sample_id.cpu().detach().numpy() return list(zip(sample_id, best_hyp_text)) @@ -506,7 +532,11 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): if self.is_interctc_enabled(): AccessMixin.set_access_enabled(access_enabled=True) - signal, signal_len, transcript, transcript_len = batch + if "multisoftmax" not in self.cfg.decoder: + signal, signal_len, transcript, transcript_len = batch + language_ids=None + else: + signal, signal_len, transcript, transcript_len, sample_ids, language_ids = batch # forward() only performs encoder forward if isinstance(batch, 
DALIOutputs) and batch.has_processed_signal: @@ -522,14 +552,14 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): if not self.joint.fuse_loss_wer: if self.compute_eval_loss: decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) + joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder, language_ids=language_ids) loss_value = self.loss( log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length ) tensorboard_logs['val_loss'] = loss_value - self.wer.update(encoded, encoded_len, transcript, transcript_len) + self.wer.update(encoded, encoded_len, transcript, transcript_len, lang_ids=language_ids) wer, wer_num, wer_denom = self.wer.compute() self.wer.reset() @@ -555,6 +585,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): transcripts=transcript, transcript_lengths=target_len, compute_wer=compute_wer, + language_ids=language_ids ) if loss_value is not None: tensorboard_logs['val_loss'] = loss_value @@ -563,7 +594,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): tensorboard_logs['val_wer_denom'] = wer_denom tensorboard_logs['val_wer'] = wer - log_probs = self.ctc_decoder(encoder_output=encoded) + log_probs = self.ctc_decoder(encoder_output=encoded, language_ids=language_ids) if self.compute_eval_loss: ctc_loss = self.ctc_loss( log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len @@ -573,7 +604,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss tensorboard_logs['val_loss'] = loss_value self.ctc_wer.update( - predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len, + predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len, lang_ids=language_ids ) ctc_wer, ctc_wer_num, ctc_wer_denom = self.ctc_wer.compute() self.ctc_wer.reset() diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 6fed8be9d..59cb6d4fb 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -479,7 +479,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # DALI Dataset implements dataloader interface return dataset - shuffle = config['shuffle'] + # shuffle = config['shuffle'] if config.get('is_tarred', False): shuffle = False @@ -492,15 +492,25 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) + if config.get('shuffle', False): + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=config['shuffle'], + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + else: + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + 
drop_last=config.get('drop_last', False), + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': """ diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 84e086358..30d66df5e 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -482,16 +482,26 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - + if config.get('shuffle', False): + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=config['shuffle'], + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + else: + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): """ Sets up the training data loader via a Dict-like object. @@ -507,8 +517,8 @@ def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True + # if 'shuffle' not in train_data_config: + # train_data_config['shuffle'] = True # preserve config self._update_dataset_config(dataset_name='train', config=train_data_config) @@ -602,7 +612,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: @typecheck() def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None + self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None, language_ids=None ): """ Forward pass of the model. 
Note that for RNNT Models, the forward pass of the model is a 3 step process, diff --git a/nemo/collections/asr/modules/__init__.py b/nemo/collections/asr/modules/__init__.py index ecd430b56..e8a6f34be 100644 --- a/nemo/collections/asr/modules/__init__.py +++ b/nemo/collections/asr/modules/__init__.py @@ -34,6 +34,7 @@ ParallelConvASREncoder, SpeakerDecoder, ) +from nemo.collections.asr.modules.multi_conv_asr import MultiConvASRDecoder from nemo.collections.asr.modules.graph_decoder import ViterbiDecoderWithGraph from nemo.collections.asr.modules.hybrid_autoregressive_transducer import HATJoint from nemo.collections.asr.modules.lstm_decoder import LSTMDecoder diff --git a/nemo/collections/asr/modules/conv_asr.py b/nemo/collections/asr/modules/conv_asr.py index a05ee894f..a203dbd59 100644 --- a/nemo/collections/asr/modules/conv_asr.py +++ b/nemo/collections/asr/modules/conv_asr.py @@ -47,6 +47,7 @@ LogprobsType, NeuralType, SpectrogramType, + StringType, ) from nemo.utils import logging @@ -409,13 +410,16 @@ class ConvASRDecoder(NeuralModule, Exportable, adapter_mixins.AdapterModuleMixin @property def input_types(self): - return OrderedDict({"encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation())}) + return OrderedDict({ + "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), + 'language_ids': [NeuralType(('B'), StringType(), optional=True)], + }) @property def output_types(self): return OrderedDict({"logprobs": NeuralType(('B', 'T', 'D'), LogprobsType())}) - def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary=None): + def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary=None, multisoftmax=False, language_masks=None): super().__init__() if vocabulary is None and num_classes < 0: @@ -447,20 +451,63 @@ def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary= # to change, requires running ``model.temperature = T`` explicitly self.temperature = 1.0 + + self.multisoftmax = multisoftmax + self.language_masks = language_masks + def masked_softmax(self, x, mask=None): + """ + Performs masked softmax, as simply masking post-softmax can be + inaccurate + :param x: [batch_size, num_items] + :param mask: [batch_size, num_items] + :return: + """ + if mask is not None: + mask = mask.float() + if mask is not None: + x_masked = x * mask + (1 - 1 / mask) + else: + x_masked = x + # print(x_masked[0][0]) + x_max = x_masked.max(-1)[0] + x_exp = (x - x_max.unsqueeze(-1)).exp() + if mask is not None: + x_exp = x_exp * mask.float() + # return (x - x_max.unsqueeze(-1)) / torch.log(x_exp.sum(-1).unsqueeze(-1)) + return x_exp / x_exp.sum(-1).unsqueeze(-1) + @typecheck() - def forward(self, encoder_output): + def forward(self, encoder_output, language_ids=None): # Adapter module forward step if self.is_adapter_available(): encoder_output = encoder_output.transpose(1, 2) # [B, T, C] encoder_output = self.forward_enabled_adapters(encoder_output) encoder_output = encoder_output.transpose(1, 2) # [B, C, T] - + if self.temperature != 1.0: - return torch.nn.functional.log_softmax( - self.decoder_layers(encoder_output).transpose(1, 2) / self.temperature, dim=-1 - ) - return torch.nn.functional.log_softmax(self.decoder_layers(encoder_output).transpose(1, 2), dim=-1) + decoder_output = self.decoder_layers(encoder_output).transpose(1, 2) / self.temperature + else: + decoder_output = self.decoder_layers(encoder_output).transpose(1, 2) + + if language_ids is not None: + sample_mask = [] + for lang_idx 
in language_ids: + sample_mask.append(self.language_masks[lang_idx]) + sample_mask = torch.tensor(sample_mask, dtype=torch.bool) # .to(decoder_output.device) + # Repeat across timesteps [B, T, C] + sample_mask = sample_mask.unsqueeze(1) + mask = sample_mask.repeat(1, decoder_output.shape[1], 1) + # Send mask to GPU + mask = mask.to(decoder_output.device) + # masked_output = self.masked_softmax(decoder_output, mask) # B x T x 3073 -> B x T x 257 + decoder_output = torch.masked_select(decoder_output, mask).view(decoder_output.shape[0],decoder_output.shape[1],-1) + else: + masked_output = None + # print(mask[0][0]) + # softmax_output = self.masked_softmax(decoder_output, mask) + # return softmax_output + return torch.nn.functional.log_softmax(decoder_output, dim=-1) def input_example(self, max_batch=1, max_dim=256): """ @@ -582,7 +629,7 @@ def __init__( padding=kernel_size // 2, ) ) - self.decoder_layers.append(nn.Conv1d(self.feat_hidden, self.feat_hidden, kernel_size=1, bias=True)) + self.decoder_layers.append(nn.Conv1d(self.feat_hidden, self.feat_hidden, kernel_size=1, bias=True)) + self.decoder_layers.append(nn.BatchNorm1d(self.feat_hidden, eps=1e-3, momentum=0.1)) self.decoder_layers.append(activation) diff --git a/nemo/collections/asr/modules/multi_conv_asr.py b/nemo/collections/asr/modules/multi_conv_asr.py new file mode 100644 index 000000000..1f0178888 --- /dev/null +++ b/nemo/collections/asr/modules/multi_conv_asr.py @@ -0,0 +1,170 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
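# The hunk above routes per-sample language masks through ConvASRDecoder.forward. The short
# standalone sketch below shows the same masking idea in isolation; the helper name, shapes,
# and the assumption that every language owns an equally sized slice of the shared vocabulary
# are illustrative, not part of the patch.
import torch
import torch.nn.functional as F

def mask_logits_by_language(decoder_output, language_ids, language_masks):
    # decoder_output: [B, T, V_total]; language_masks[lang] is a list of V_total booleans
    # marking which output columns belong to that language.
    sample_mask = torch.tensor([language_masks[lang] for lang in language_ids], dtype=torch.bool)  # [B, V_total]
    # Repeat the per-sample mask across timesteps and move it to the logits' device.
    mask = sample_mask.unsqueeze(1).repeat(1, decoder_output.shape[1], 1).to(decoder_output.device)
    # Keep only each sample's language columns; the view stays rectangular because all
    # languages are assumed to have the same vocabulary size.
    selected = torch.masked_select(decoder_output, mask).view(decoder_output.shape[0], decoder_output.shape[1], -1)
    return F.log_softmax(selected, dim=-1)  # [B, T, V_lang]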
+from collections import OrderedDict +from dataclasses import dataclass, field +from typing import List, Optional, Set, Union + +import torch +import torch.distributed +import torch.nn as nn +import torch.nn.functional as F +from omegaconf import MISSING, DictConfig, ListConfig, OmegaConf + +from nemo.collections.asr.parts.submodules.jasper import ( + JasperBlock, + MaskedConv1d, + ParallelBlock, + SqueezeExcite, + init_weights, + jasper_activations, +) +from nemo.collections.asr.parts.submodules.tdnn_attention import ( + AttentivePoolLayer, + StatsPoolLayer, + TDNNModule, + TDNNSEModule, +) +from nemo.collections.asr.parts.utils import adapter_utils +from nemo.core.classes.common import typecheck +from nemo.core.classes.exportable import Exportable +from nemo.core.classes.mixins import AccessMixin, adapter_mixins +from nemo.core.classes.module import NeuralModule +from nemo.core.neural_types import ( + AcousticEncodedRepresentation, + LengthsType, + LogitsType, + LogprobsType, + NeuralType, + SpectrogramType, + StringType, +) +from nemo.utils import logging + +__all__ = ['MultiConvASRDecoder'] + + +class MultiConvASRDecoder(NeuralModule, Exportable, adapter_mixins.AdapterModuleMixin): + """Simple ASR Decoder for use with CTC-based models such as JasperNet and QuartzNet + + Based on these papers: + https://arxiv.org/pdf/1904.03288.pdf + https://arxiv.org/pdf/1910.10261.pdf + https://arxiv.org/pdf/2005.04290.pdf + """ + + @property + def input_types(self): + return OrderedDict({ + "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), + 'language_ids': [NeuralType(('B'), StringType(), optional=True)], + }) + + @property + def output_types(self): + return OrderedDict({"logprobs": NeuralType(('B', 'T', 'D'), LogprobsType())}) + + def __init__(self, feat_in, languages, num_classes_per_lang, init_mode="xavier_uniform", num_classes=None, vocabulary=None, multisoftmax=True): + super().__init__() + + # if vocabulary is None and num_classes < 0: + # raise ValueError( + # f"Neither of the vocabulary and num_classes are set! At least one of them need to be set." + # ) + + # if num_classes <= 0: + # num_classes = len(vocabulary) + # logging.info(f"num_classes of ConvASRDecoder is set to the size of the vocabulary: {num_classes}.") + + if vocabulary is not None: + # if num_classes != len(vocabulary): + # raise ValueError( + # f"If vocabulary is specified, it's length should be equal to the num_classes. 
Instead got: num_classes={num_classes} and len(vocabulary)={len(vocabulary)}" + # ) + self.__vocabulary = vocabulary + self._feat_in = feat_in + # Add 1 for blank char + self._num_classes_per_lang = [] + self.languages = languages + for num_classes in num_classes_per_lang: + self._num_classes_per_lang.append(num_classes + 1) + self._num_classes = self._num_classes_per_lang[0] + + + self.decoder_layers = {} + for lang, num_classes in zip(self.languages, self._num_classes_per_lang): + self.decoder_layers[lang] = torch.nn.Sequential( + torch.nn.Conv1d(self._feat_in, num_classes, kernel_size=1, bias=True) + ) + self.decoder_layers = torch.nn.ModuleDict(self.decoder_layers) + self.apply(lambda x: init_weights(x, mode=init_mode)) + + accepted_adapters = [adapter_utils.LINEAR_ADAPTER_CLASSPATH] + self.set_accepted_adapter_types(accepted_adapters) + + # to change, requires running ``model.temperature = T`` explicitly + self.temperature = 1.0 + + @typecheck() + def forward(self, encoder_output, language_ids): + # Adapter module forward step + if self.is_adapter_available(): + encoder_output = encoder_output.transpose(1, 2) # [B, T, C] + encoder_output = self.forward_enabled_adapters(encoder_output) + encoder_output = encoder_output.transpose(1, 2) # [B, C, T] + + language = language_ids[0] + if self.temperature != 1.0: + decoder_output = self.decoder_layers[language](encoder_output).transpose(1, 2) / self.temperature + else: + decoder_output = self.decoder_layers[language](encoder_output).transpose(1, 2) + + return torch.nn.functional.log_softmax(decoder_output, dim=-1) + + def input_example(self, max_batch=1, max_dim=256): + """ + Generates input examples for tracing etc. + Returns: + A tuple of input examples. + """ + input_example = torch.randn(max_batch, self._feat_in, max_dim).to(next(self.parameters()).device) + return tuple([input_example]) + + def _prepare_for_export(self, **kwargs): + m_count = 0 + for m in self.modules(): + if type(m).__name__ == "MaskedConv1d": + m.use_mask = False + m_count += 1 + if m_count > 0: + logging.warning(f"Turned off {m_count} masked convolutions") + Exportable._prepare_for_export(self, **kwargs) + + # Adapter method overrides + def add_adapter(self, name: str, cfg: DictConfig): + # Update the config with correct input dim + cfg = self._update_adapter_cfg_input_dim(cfg) + # Add the adapter + super().add_adapter(name=name, cfg=cfg) + + def _update_adapter_cfg_input_dim(self, cfg: DictConfig): + cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self._feat_in) + return cfg + + @property + def vocabulary(self): + return self.__vocabulary + + @property + def num_classes_with_blank(self): + return self._num_classes diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index 04bdd25ac..c34c21dd1 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -48,6 +48,7 @@ LossType, NeuralType, SpectrogramType, + StringType ) from nemo.utils import logging @@ -572,6 +573,9 @@ def __init__( normalization_mode: Optional[str] = None, random_state_sampling: bool = False, blank_as_pad: bool = True, + multisoftmax=False, + language_masks=None, + ): # Required arguments self.pred_hidden = prednet['pred_hidden'] @@ -602,6 +606,9 @@ def __init__( rnn_hidden_size=prednet.get("rnn_hidden_size", -1), ) self._rnnt_export = False + + self.multisoftmax = multisoftmax + self.language_masks = language_masks @typecheck() def forward(self, targets, target_length, states=None): @@ -976,7 +983,7 @@ 
def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List Returns: (tuple): decoder states for given id - ([L x (1, H)], [L x (1, H)]) + ([L x (1, H)], [L x (1, H)]s) """ if batch_states is not None: state_list = [] @@ -1130,6 +1137,7 @@ def input_types(self): "transcripts": NeuralType(('B', 'T'), LabelsType(), optional=True), "transcript_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), "compute_wer": NeuralType(optional=True), + 'language_ids': [NeuralType(('B'), StringType(), optional=True)], } @property @@ -1181,6 +1189,11 @@ def __init__( fuse_loss_wer: bool = False, fused_batch_size: Optional[int] = None, experimental_fuse_loss_wer: Any = None, + language_masks=None, + multilingual: bool = False, + language_keys: Optional[List] = None, + token_id_offsets=None, + offset_token_ids_by_token_id=None, ): super().__init__() @@ -1189,6 +1202,11 @@ def __init__( self._vocab_size = num_classes self._num_extra_outputs = num_extra_outputs self._num_classes = num_classes + 1 + num_extra_outputs # 1 is for blank + self.language_masks = language_masks + self.token_id_offsets = token_id_offsets + self.offset_token_ids_by_token_id = offset_token_ids_by_token_id + self.multilingual = multilingual + self.language_keys = language_keys if experimental_fuse_loss_wer is not None: # Override fuse_loss_wer from deprecated argument @@ -1247,6 +1265,7 @@ def forward( transcripts: Optional[torch.Tensor] = None, transcript_lengths: Optional[torch.Tensor] = None, compute_wer: bool = False, + language_ids=None, ) -> Union[torch.Tensor, List[Optional[torch.Tensor]]]: # encoder = (B, D, T) # decoder = (B, D, U) if passed, else None @@ -1262,7 +1281,24 @@ def forward( "decoder_outputs can only be None for fused step!" ) - out = self.joint(encoder_outputs, decoder_outputs) # [B, T, U, V + 1] + out = self.joint(encoder_outputs, decoder_outputs, language_ids=language_ids) # [B, T, U, V + 1] + + # if language_ids is not None: + # sample_mask = [] + # for lang_idx in language_ids: + # sample_mask.append(self.language_masks[lang_idx]) + # sample_mask = torch.tensor(sample_mask, dtype=torch.bool) # .to(decoder_output.device) + # # Repeat across timesteps [B, T, U, V + 1] + # sample_mask = sample_mask.unsqueeze(1) + # mask = sample_mask.repeat(1, out.shape[1], 1) + # sample_mask = sample_mask.unsqueeze(2) + # mask = sample_mask.repeat(1, 1, out.shape[2], 1) + # # Send mask to GPU + # mask = mask.to(out.device) + # # masked_output = self.masked_softmax(decoder_output, mask) + # # print("Before mask", sub_joint.shape) + # out = torch.masked_select(out, mask).view(out.shape[0],out.shape[1],out.shape[2],-1) + return out else: @@ -1318,8 +1354,27 @@ def forward( sub_dec = sub_dec.narrow(dim=1, start=0, length=int(max_sub_transcript_length + 1)) # Perform joint => [sub-batch, T', U', V + 1] - sub_joint = self.joint(sub_enc, sub_dec) - + if language_ids is not None: + sub_joint = self.joint(sub_enc, sub_dec, language_ids=language_ids[begin:end]) + else: + sub_joint = self.joint(sub_enc, sub_dec) + + # if language_ids is not None: + # sample_mask = [] + # for lang_idx in language_ids[begin:end]: + # sample_mask.append(self.language_masks[lang_idx]) + # sample_mask = torch.tensor(sample_mask, dtype=torch.bool) # .to(decoder_output.device) + # # Repeat across timesteps [sub-batch, T, U, V + 1] + # sample_mask = sample_mask.unsqueeze(1) + # mask = sample_mask.repeat(1, sub_joint.shape[1], 1) + # sample_mask = sample_mask.unsqueeze(2) + # mask = sample_mask.repeat(1, 1, sub_joint.shape[2], 1) + # # 
Send mask to GPU + # mask = mask.to(sub_joint.device) + # # masked_output = self.masked_softmax(decoder_output, mask) + # # print("Before mask", sub_joint.shape) + # sub_joint = torch.masked_select(sub_joint, mask).view(sub_joint.shape[0],sub_joint.shape[1],sub_joint.shape[2],-1) + # print("After mask", sub_joint.shape) del sub_dec # Reduce transcript length to correct alignment @@ -1357,7 +1412,10 @@ def forward( sub_transcripts = sub_transcripts.detach() # Update WER on each process without syncing - self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens) + if language_ids is not None: + self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens, lang_ids=language_ids[begin:end]) + else: + self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens) del sub_enc, sub_transcripts, sub_enc_lens, sub_transcript_lens @@ -1377,7 +1435,7 @@ def forward( return losses, wer, wer_num, wer_denom - def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor: + def joint(self, f: torch.Tensor, g: torch.Tensor, language_ids=None) -> torch.Tensor: """ Compute the joint step of the network. @@ -1422,8 +1480,20 @@ def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor: # Forward adapter modules on joint hidden if self.is_adapter_available(): inp = self.forward_enabled_adapters(inp) - - res = self.joint_net(inp) # [B, T, U, V + 1] + + # res = self.joint_net(inp) # [B, T, U, V + 1] + + if language_ids is not None: + + # Do partial forward of joint net (skipping the final linear) + for module in self.joint_net[:-1]: + inp = module(inp) # [B, T, U, H] + res_single = [] + for single_inp, lang in zip(inp, language_ids): + res_single.append(self.joint_net[-1][lang](single_inp)) + res = torch.stack(res_single) + else: + res = self.joint_net(inp) # [B, T, U, V + 1] del inp @@ -1473,11 +1543,22 @@ def _joint_net_modules(self, num_classes, pred_n_hidden, enc_n_hidden, joint_n_h elif activation == 'tanh': activation = torch.nn.Tanh() - layers = ( - [activation] - + ([torch.nn.Dropout(p=dropout)] if dropout else []) - + [torch.nn.Linear(joint_n_hidden, num_classes)] - ) + if self.multilingual: + final_layer = torch.nn.ModuleDict() + logging.info(f"Vocab size for each language: {self._vocab_size // len(self.language_keys)}") + for lang in self.language_keys: + final_layer[lang] = torch.nn.Linear(joint_n_hidden, (self._vocab_size // len(self.language_keys)+1)) + layers = ( + [activation] + + ([torch.nn.Dropout(p=dropout)] if dropout else []) + + [final_layer] + ) + else: + layers = ( + [activation] + + ([torch.nn.Dropout(p=dropout)] if dropout else []) + + [torch.nn.Linear(joint_n_hidden, num_classes)] + ) return pred, enc, torch.nn.Sequential(*layers) # Adapter method overrides @@ -1688,6 +1769,9 @@ def __init__( preserve_memory: bool = False, fuse_loss_wer: bool = False, fused_batch_size: Optional[int] = None, + language_masks=None, + token_id_offsets=None, + offset_token_ids_by_token_id=None, ): super().__init__( jointnet=jointnet, @@ -1697,6 +1781,9 @@ def __init__( preserve_memory=preserve_memory, fuse_loss_wer=fuse_loss_wer, fused_batch_size=fused_batch_size, + language_masks=language_masks, + token_id_offsets=token_id_offsets, + offset_token_ids_by_token_id=offset_token_ids_by_token_id, ) self.n_samples = n_samples self.register_buffer('blank_id', torch.tensor([self.num_classes_with_blank - 1]), persistent=False) @@ -1710,6 +1797,7 @@ def forward( transcripts: Optional[torch.Tensor] = None, transcript_lengths: Optional[torch.Tensor] = None, 
compute_wer: bool = False, + language_ids=None, ) -> Union[torch.Tensor, List[Optional[torch.Tensor]]]: # If in inference mode, revert to basic RNNT Joint behaviour. # Sampled RNNT is only used for training. @@ -1722,6 +1810,7 @@ def forward( transcripts=transcripts, transcript_lengths=transcript_lengths, compute_wer=compute_wer, + language_ids=language_ids, ) if transcripts is None or transcript_lengths is None: @@ -1799,7 +1888,7 @@ def forward( # Perform sampled joint => [sub-batch, T', U', {V' < V} + 1}] sub_joint, sub_transcripts_remapped = self.sampled_joint( - sub_enc, sub_dec, transcript=sub_transcripts, transcript_lengths=sub_transcript_lens + sub_enc, sub_dec, transcript=sub_transcripts, transcript_lengths=sub_transcript_lens, language_ids=language_ids[begin:end], ) del sub_dec @@ -1842,7 +1931,10 @@ def forward( sub_transcripts = sub_transcripts.detach() # Update WER on each process without syncing - self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens) + if language_ids is not None: + self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens, lang_ids=language_ids[begin:end]) + else: + self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens) del sub_enc, sub_transcripts, sub_enc_lens, sub_transcript_lens @@ -1863,7 +1955,7 @@ def forward( return losses, wer, wer_num, wer_denom def sampled_joint( - self, f: torch.Tensor, g: torch.Tensor, transcript: torch.Tensor, transcript_lengths: torch.Tensor, + self, f: torch.Tensor, g: torch.Tensor, transcript: torch.Tensor, transcript_lengths: torch.Tensor, language_ids=None, ) -> torch.Tensor: """ Compute the sampled joint step of the network. @@ -1932,7 +2024,19 @@ def sampled_joint( # Begin compute of sampled RNNT joint with torch.no_grad(): - # gather true labels + + if language_ids is not None: + transcript_with_offset = [] + for t, lang in zip(transcript, language_ids): + offset_transcript = [] + for t_token in t.tolist(): + if t_token != 0: + offset_transcript.append(t_token+self.token_id_offsets[lang]) + else: + offset_transcript.append(0) + transcript_with_offset.append(offset_transcript) + transcript = torch.tensor(transcript_with_offset, dtype=transcript.dtype, device=transcript.device) + transcript_vocab_ids = torch.unique(transcript) # augment with blank token id @@ -1966,6 +2070,10 @@ def sampled_joint( # new_transcript = [1, 0, 2, 3, 2, 0] index = torch.bucketize(transcript.ravel(), palette) transcript = key[index].reshape(transcript.shape) + if language_ids is not None: + # remap to original transcript ids which are without offsets for multi-softmax + new_transcript = [[self.offset_token_ids_by_token_id[idx.item()] for idx in t] for t in transcript] + transcript = torch.tensor(new_transcript, dtype=transcript.dtype) transcript = transcript.to(t_device) # Extract out partial weight tensor and bias tensor of just the V_Pos vocabulary from the full joint. @@ -2027,7 +2135,25 @@ def sampled_joint( # Finally, construct the sampled joint as the V_Sampled = Union(V_Pos, V_Neg) # Here, we simply concatenate the two tensors to construct the joint with V_Sampled vocab # because before we have properly asserted that Intersection(V_Pos, V_Neg) is a null set. 
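# The sampled-joint changes above shift each language's token ids by a per-language offset and
# later map them back through a reverse table. A small, self-contained illustration of that
# bookkeeping follows; the two-language sizes are made-up numbers, and the patch additionally
# leaves id 0 (blank/pad) un-shifted.
def build_token_id_offsets(vocab_sizes):
    # vocab_sizes: insertion-ordered dict such as {'en': 128, 'hi': 128} (hypothetical sizes).
    offsets, shared_to_local = {}, {}
    running = 0
    for lang, size in vocab_sizes.items():
        offsets[lang] = running
        for local_id in range(size):
            shared_to_local[running + local_id] = local_id
        running += size
    return offsets, shared_to_local

offsets, shared_to_local = build_token_id_offsets({'en': 128, 'hi': 128})
shared_id = 5 + offsets['hi']            # local id 5 of 'hi' lands at 133 in the shared space
assert shared_to_local[shared_id] == 5   # and the reverse table recovers the local id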
+ # print(transcript_scores.shape, noise_scores.shape) res = torch.cat([transcript_scores, noise_scores], dim=-1) + + # Multisoftmax language-wise sampling of output + if language_ids is not None: + sample_mask = [] + sampled_vocab = transcript_vocab_ids.tolist() + accept_samples.tolist() + for lang_idx in language_ids: + sample_mask.append([self.language_masks[lang_idx][v] for v in sampled_vocab]) + sample_mask = torch.tensor(sample_mask, dtype=torch.bool) # .to(decoder_output.device) + # Repeat across timesteps [B, T, U, V + 1] + sample_mask = sample_mask.unsqueeze(1) + sample_mask = sample_mask.repeat(1, res.shape[1], 1) + sample_mask = sample_mask.unsqueeze(2) + mask = sample_mask.repeat(1, 1, res.shape[2], 1) + # Send mask to GPU + mask = mask.to(res.device) + print(res.shape, mask.shape) + res = torch.masked_select(res, mask).view(res.shape[0],res.shape[1],res.shape[2],-1) del inp diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index eba896d04..e97c92d04 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -57,6 +57,8 @@ def _setup_tokenizer(self, tokenizer_cfg: DictConfig): raise ValueError("`tokenizer.type` cannot be None") elif tokenizer_type.lower() == 'agg': self._setup_aggregate_tokenizer(tokenizer_cfg) + elif tokenizer_type.lower() == 'multilingual': + self._setup_multilingual_tokenizer(tokenizer_cfg) else: self._setup_monolingual_tokenizer(tokenizer_cfg) @@ -215,6 +217,49 @@ def _setup_aggregate_tokenizer(self, tokenizer_cfg: DictConfig): ][lang]['type'] self.tokenizer = tokenizers.AggregateTokenizer(tokenizers_dict) + + def _setup_multilingual_tokenizer(self, tokenizer_cfg: DictConfig): + # Prevent tokenizer parallelism (unless user has explicitly set it) + if 'TOKENIZERS_PARALLELISM' not in os.environ: + os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + self.tokenizer_cfg = OmegaConf.to_container(tokenizer_cfg, resolve=True) # type: dict + + # the aggregate tokenizer does not have one tokenizer_dir but multiple ones + self.tokenizer_dir = None + + self.tokenizer_cfg.pop('dir', None) # Remove tokenizer directory, if any + # Remove tokenizer_type -- obviously if we are here, the type is 'agg' + self.tokenizer_type = self.tokenizer_cfg.pop('type').lower() + + # the aggregate tokenizer should not have these + self.hf_tokenizer_kwargs = {} + self.tokenizer_cfg.pop("hf_kwargs", {}) # Remove HF tokenizer kwargs, if any + + logging.info('_setup_tokenizer: detected an aggregate tokenizer') + # need to de-register any monolingual config items if they exist + self._cleanup_monolingual_and_aggregate_config_and_artifacts_if_needed() + + # overwrite tokenizer type + if hasattr(self, 'cfg') and 'tokenizer' in self.cfg: + self.cfg.tokenizer.type = self.tokenizer_type + + tokenizers_dict = {} + # init each of the monolingual tokenizers found in the config and assemble into AggregateTokenizer + for lang, tokenizer_config in self.tokenizer_cfg[self.AGGREGATE_TOKENIZERS_DICT_PREFIX].items(): + (tokenizer, model_path, vocab_path, spe_vocab_path,) = self._make_tokenizer(tokenizer_config, lang) + + tokenizers_dict[lang] = tokenizer + if hasattr(self, 'cfg'): + with open_dict(self.cfg.tokenizer): + self.cfg.tokenizer[self.AGGREGATE_TOKENIZERS_DICT_PREFIX][lang]['dir'] = self.tokenizer_cfg[ + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + ][lang]['dir'] + self.cfg.tokenizer[self.AGGREGATE_TOKENIZERS_DICT_PREFIX][lang]['type'] = self.tokenizer_cfg[ + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + 
][lang]['type'] + + self.tokenizer = tokenizers.MultilingualTokenizer(tokenizers_dict) def _make_tokenizer(self, tokenizer_cfg: DictConfig, lang=None): diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index 5e98b03f2..4b055bc76 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -38,7 +38,7 @@ from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMeasureMixin, ConfidenceMethodConfig from nemo.collections.common.parts.rnn import label_collate from nemo.core.classes import Typing, typecheck -from nemo.core.neural_types import AcousticEncodedRepresentation, ElementType, HypothesisType, LengthsType, NeuralType +from nemo.core.neural_types import AcousticEncodedRepresentation, ElementType, HypothesisType, LengthsType, NeuralType, StringType from nemo.utils import logging @@ -140,6 +140,7 @@ def input_types(self): "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), "encoded_lengths": NeuralType(tuple('B'), LengthsType()), "partial_hypotheses": [NeuralType(elements_type=HypothesisType(), optional=True)], # must always be last + "language_ids": [NeuralType(('B'), StringType(), optional=True)], # must always be last } @property @@ -213,7 +214,7 @@ def _pred_step( # output: [B, 1, K] return self.decoder.predict(label, hidden, add_sos=add_sos, batch_size=batch_size) - def _joint_step(self, enc, pred, log_normalize: Optional[bool] = None): + def _joint_step(self, enc, pred, log_normalize: Optional[bool] = None, language_ids=None): """ Common joint step based on AbstractRNNTJoint implementation. @@ -226,8 +227,13 @@ def _joint_step(self, enc, pred, log_normalize: Optional[bool] = None): logits of shape (B, T=1, U=1, V + 1) """ with torch.no_grad(): - logits = self.joint.joint(enc, pred) - + ## Old + # logits = self.joint.joint(enc, pred) + ## New for multisoftmax + self.joint._fuse_loss_wer = False + logits = self.joint(encoder_outputs=enc.transpose(1, 2), decoder_outputs=pred.transpose(1, 2), language_ids=language_ids) + self.joint._fuse_loss_wer = True + if log_normalize is None: if not logits.is_cuda: # Use log softmax only if on CPU logits = logits.log_softmax(dim=len(logits.shape) - 1) @@ -569,6 +575,7 @@ def forward( encoder_output: torch.Tensor, encoded_lengths: torch.Tensor, partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, + language_ids=None, ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. Output token is generated auto-repressively. 
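# The greedy-decoding hunks above forward language_ids into the joint network, whose final
# projection in this patch is selected per sample by language. A compact sketch of that
# ModuleDict-of-heads pattern is below; hidden size, vocabulary size, and the language keys
# are illustrative assumptions.
import torch
import torch.nn as nn

class PerLanguageJointHead(nn.Module):
    def __init__(self, joint_hidden=640, vocab_per_lang=128, langs=('en', 'hi')):
        super().__init__()
        # One Linear head per language; +1 accounts for the blank symbol of each sub-vocabulary.
        self.heads = nn.ModuleDict({lang: nn.Linear(joint_hidden, vocab_per_lang + 1) for lang in langs})

    def forward(self, joint_hidden, language_ids):
        # joint_hidden: [B, T, U, H]; language_ids: sequence of B language keys.
        per_sample = [self.heads[lang](h) for h, lang in zip(joint_hidden, language_ids)]
        return torch.stack(per_sample)  # [B, T, U, V_lang + 1]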
@@ -596,7 +603,7 @@ def forward( with self.decoder.as_frozen(), self.joint.as_frozen(): inseq = encoder_output # [B, T, D] hypotheses = self._greedy_decode( - inseq, logitlen, device=inseq.device, partial_hypotheses=partial_hypotheses + inseq, logitlen, device=inseq.device, partial_hypotheses=partial_hypotheses, language_ids=language_ids ) # Pack the hypotheses results @@ -613,6 +620,7 @@ def _greedy_decode_blank_as_pad( out_len: torch.Tensor, device: torch.device, partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, + language_ids=None, ): if partial_hypotheses is not None: raise NotImplementedError("`partial_hypotheses` support is not supported") @@ -682,7 +690,7 @@ def _greedy_decode_blank_as_pad( # Batched joint step - Output = [B, V + 1] # If preserving per-frame confidence, log_normalize must be true - logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None)[ + logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None, language_ids=language_ids)[ :, 0, 0, : ] @@ -818,6 +826,7 @@ def _greedy_decode_masked( out_len: torch.Tensor, device: torch.device, partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, + language_ids=None, ): if partial_hypotheses is not None: raise NotImplementedError("`partial_hypotheses` support is not supported") @@ -898,7 +907,7 @@ def _greedy_decode_masked( # Batched joint step - Output = [B, V + 1] # If preserving per-frame confidence, log_normalize must be true - logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None)[ + logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None, language_ids=language_ids)[ :, 0, 0, : ] diff --git a/nemo/collections/common/data/dataset.py b/nemo/collections/common/data/dataset.py index 030e99780..97c7fdd77 100644 --- a/nemo/collections/common/data/dataset.py +++ b/nemo/collections/common/data/dataset.py @@ -63,7 +63,6 @@ def __init__( self.world_size = world_size self.sampling_kwargs = {} self.sampling_scale = sampling_scale - if sampling_technique == 'temperature': self.index_generator = ConcatDataset.temperature_generator self.sampling_kwargs['temperature'] = sampling_temperature diff --git a/nemo/collections/common/parts/preprocessing/collections.py b/nemo/collections/common/parts/preprocessing/collections.py index 4616f95e1..5c3c35990 100644 --- a/nemo/collections/common/parts/preprocessing/collections.py +++ b/nemo/collections/common/parts/preprocessing/collections.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from tqdm import tqdm import collections import json import os @@ -138,9 +139,9 @@ def __init__( if index_by_file_id: self.mapping = {} - for id_, audio_file, duration, offset, text, speaker, orig_sr, token_labels, lang in zip( + for id_, audio_file, duration, offset, text, speaker, orig_sr, token_labels, lang in tqdm(zip( ids, audio_files, durations, offsets, texts, speakers, orig_sampling_rates, token_labels, langs - ): + )): # Duration filters. 
if min_duration is not None and duration < min_duration: duration_filtered += duration @@ -217,7 +218,7 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): [], ) speakers, orig_srs, token_labels, langs = [], [], [], [] - for item in manifest.item_iter(manifests_files): + for item in tqdm(manifest.item_iter(manifests_files)): ids.append(item['id']) audio_files.append(item['audio_file']) durations.append(item['duration']) diff --git a/nemo/collections/common/tokenizers/__init__.py b/nemo/collections/common/tokenizers/__init__.py index f46e3b150..57ff9fae2 100644 --- a/nemo/collections/common/tokenizers/__init__.py +++ b/nemo/collections/common/tokenizers/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from nemo.collections.common.tokenizers.aggregate_tokenizer import AggregateTokenizer +from nemo.collections.common.tokenizers.multilingual_tokenizer import MultilingualTokenizer from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer diff --git a/nemo/collections/common/tokenizers/multilingual_tokenizer.py b/nemo/collections/common/tokenizers/multilingual_tokenizer.py new file mode 100644 index 000000000..a26f9230e --- /dev/null +++ b/nemo/collections/common/tokenizers/multilingual_tokenizer.py @@ -0,0 +1,235 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Union + +import numpy as np + +from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec +from nemo.utils import logging + +__all__ = ['MultilingualTokenizer'] + + +class DummyTokenizer: + def __init__(self, vocab): + self.vocab = vocab + self.vocab_size = len(vocab) + + # minimum compatibility + # since all the monolingual tokenizers have a vocab + # additional methods could be added here + def get_vocab(self): + return self.vocab + + +class MultilingualTokenizer(TokenizerSpec): + ''' + MultilingualTokenizer, allowing one to combine multiple regular monolingual tokenizers into one tokenizer. + The intuition is that we can use existing tokenizers "as is", without retraining, and associate each tokenizer with a language id + during text processing (language id will be used to route the incoming text sample to the right tokenizer) + as well as a token id range for detokenization (e.g. [0..127] for tokenizer A, [128..255] for tokenizer B) so + that the original text could be reconstructed. Note that we assume that the incoming dict of langs / tokenizers + is ordered, e.g.
the first tokenizer will be assigned a lower interval of token ids + Args: + tokenizers: dict of tokenizers, keys are lang ids, values are actual tokenizers + ''' + + def __init__(self, tokenizers: Dict): + + self.tokenizers_dict = tokenizers + self.vocabulary = [] + + # the tokenizers should produce non-overlapping, ordered token ids + # keys are language ids + self.token_id_offset = {} + + # keys are tokenizer numbers + self.token_id_offset_by_tokenizer_num = {} + offset = 0 + i = 0 + for lang, tokenizer in self.tokenizers_dict.items(): + self.token_id_offset[lang] = offset + self.token_id_offset_by_tokenizer_num[i] = offset + offset += len(tokenizer.vocab) + i += 1 + + for tokenizer in self.tokenizers_dict.values(): + self.vocabulary.extend(tokenizer.vocab) + + self.vocab_size = len(self.vocabulary) + logging.info(f'Aggregate vocab size: {self.vocab_size}') + + # for compatibility purposes only -- right now only the get_vocab method + # is supported, returning the joint vocab across all tokenizers + self.tokenizer = DummyTokenizer(self.vocabulary) + + # lookup tables to speed up token to text operations + # if there are two tokenizers, [0,1], ['en', 'es'], each with 128 tokens, the aggregate tokenizer + # token range will be [0,255]. The below method provides three look up tables: + # one, to convert the incoming token id -- e.g. 200 into its real id (200-127 = 73) + # second, to compute the tokenizer id that should process that token (1) + # third, the compute the lang id for that token ('es') + offset_token_ids_by_token_id, tokenizers_by_token_id, langs_by_token_id = self._calculate_offsets() + + self.offset_token_ids_by_token_id = offset_token_ids_by_token_id + self.tokenizers_by_token_id = tokenizers_by_token_id + self.langs_by_token_id = langs_by_token_id + + def _calculate_offsets(self): + offsets = {} + tokenizers = {} + langs = {} + cur_num = 0 + tot = len(self.tokenizers_dict) + for id in range(len(self.vocabulary)): + off_id = id - list(self.token_id_offset.values())[cur_num] + if cur_num + 1 < tot: + if id >= list(self.token_id_offset.values())[cur_num + 1]: + cur_num += 1 + off_id = id - list(self.token_id_offset.values())[cur_num] + offsets[id] = off_id + tokenizers[id] = list(self.tokenizers_dict.values())[cur_num] + langs[id] = list(self.tokenizers_dict.keys())[cur_num] + + return offsets, tokenizers, langs + + def text_to_tokens(self, text, lang_id): + tokenizer = self.tokenizers_dict[lang_id] + return tokenizer.text_to_tokens(text) + + def text_to_ids(self, text, lang_id): + tokenizer = self.tokenizers_dict[lang_id] + token_ids = tokenizer.text_to_ids(text) + # token_ids[:] = [t + self.token_id_offset[lang_id] for t in token_ids] + + return token_ids + + def tokens_to_text(self, tokens, lang_id): + if isinstance(tokens, np.ndarray): + tokens = tokens.tolist() + + tokenizer = self.tokenizers_dict[lang_id] + return tokenizer.decode_pieces(tokens) + + def ids_to_text(self, ids, lang): + if isinstance(ids, np.ndarray): + ids = ids.tolist() + + tokens = [] + for id in ids: + # offset_id = self.offset_token_ids_by_token_id[id] + # tokenizer = self.tokenizers_by_token_id[id] + tokenizer = self.tokenizers_dict[lang] + # tokens.extend(tokenizer.ids_to_tokens([offset_id])) + tokens.extend(tokenizer.ids_to_tokens([id])) + text = ''.join(tokens).replace('▁', ' ') + + return text + + def token_to_id(self, token, lang_id): + tokenizer = self.tokenizers_dict[lang_id] + return tokenizer.token_to_id(token) + self.token_id_offset[lang_id] + + def ids_to_tokens(self, ids): + tokens = [] + 
+ for id in ids: + offset_id = self.offset_token_ids_by_token_id[id] + tokenizer = self.tokenizers_by_token_id[id] + token = tokenizer.ids_to_tokens([offset_id])[0] + tokens.append(token) + + return tokens + + def ids_to_text_and_langs(self, ids): + text_and_langs = [] + + for id in ids: + offset_id = self.offset_token_ids_by_token_id[id] + tokenizer = self.tokenizers_by_token_id[id] + token = tokenizer.ids_to_tokens([offset_id])[0] + text = token.replace('▁', ' ') + text = text.strip() # strip for display purposes + lang = self.langs_by_token_id[id] + text_and_langs.append({'char': text, 'lang': lang}) + + return text_and_langs + + def ids_to_words_and_langs(self, ids): + words_and_langs = [] + + word_ids = [] # tokens belonging to the current word + for id in ids: + offset_id = self.offset_token_ids_by_token_id[id] + tokenizer = self.tokenizers_by_token_id[id] + token = tokenizer.ids_to_tokens([offset_id])[0] + if token.startswith('▁'): + if len(word_ids) > 0: # if this isn't the first word + word = self.ids_to_text(word_ids) + word = word.strip() # strip for display purposes + lang = self.ids_to_lang(word_ids) + wl = {'word': word, 'lang': lang} + words_and_langs.append(wl) + word_ids = [] + word_ids.append(id) + + if len(word_ids) > 0: # the last tokens + word = self.ids_to_text(word_ids) + word = word.strip() # strip for display purposes + lang = self.ids_to_lang(word_ids) + wl = {'word': word, 'lang': lang} + words_and_langs.append(wl) + + return words_and_langs + + def ids_to_lang(self, ids): + lang_cnts = {} + + for id in ids: + lang = self.langs_by_token_id[id] + lang_cnt = lang_cnts.get(lang) + if lang_cnt is not None: + lang_cnts[lang] = lang_cnt + 1 + else: + lang_cnts[lang] = 1 + + max_lang = '' + max_lang_cnt = -1 + for lang, lang_cnt in lang_cnts.items(): + if lang_cnt > max_lang_cnt: + max_lang = lang + max_lang_cnt = lang_cnt + + return max_lang + + def tokens_to_ids(self, tokens: Union[str, List[str]], langs: Union[str, List[str]]) -> Union[int, List[int]]: + if isinstance(tokens, str): + tokens = [tokens] + if isinstance(langs, str): + langs = [langs] + + ids = [] + for i, token in enumerate(tokens): + lang_id = langs[i] + ids.append(self.token_to_id(token, lang_id)) + return ids + + @property + def vocab(self): + return self.vocabulary + + @property + def langs(self): + return list(self.tokenizers_dict.keys()) diff --git a/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py b/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py index c783f803a..1f282230c 100644 --- a/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py +++ b/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py @@ -36,8 +36,8 @@ parser.add_argument( "--id_language2", default=None, type=str, help='Identifier for language 2, eg: en, es, hi', required=True ) -parser.add_argument("--max_sample_duration_sec", default=19, type=int, help='Maximum duration of sample (sec)') -parser.add_argument("--min_sample_duration_sec", default=16, type=int, help='Minimum duration of sample (sec)') +parser.add_argument("--max_sample_duration_sec", default=30, type=int, help='Maximum duration of sample (sec)') +parser.add_argument("--min_sample_duration_sec", default=20, type=int, help='Minimum duration of sample (sec)') parser.add_argument("--dataset_size_required_hrs", default=1, type=int, help='Duration of dataset required (hrs)') args = parser.parse_args() From 1ed6d46eb05d0994fb894fd5439d4c04a4c4a775 Mon Sep 17 00:00:00 
2001 From: ASR Date: Wed, 24 Jan 2024 10:44:35 +0530 Subject: [PATCH 30/35] added some things --- nemo/collections/asr/metrics/wer_bpe.py | 16 +++++-- nemo/collections/asr/models/ctc_bpe_models.py | 11 +++-- .../asr/models/ctc_bpe_multisoftmax_models.py | 2 + .../common/parts/preprocessing/collections.py | 1 + nemo/utils/exp_manager.py | 45 +++++++++++++++++-- 5 files changed, 65 insertions(+), 10 deletions(-) diff --git a/nemo/collections/asr/metrics/wer_bpe.py b/nemo/collections/asr/metrics/wer_bpe.py index 3e3ee1923..93ad0de75 100644 --- a/nemo/collections/asr/metrics/wer_bpe.py +++ b/nemo/collections/asr/metrics/wer_bpe.py @@ -138,7 +138,7 @@ class CTCBPEDecoding(AbstractCTCDecoding): tokenizer: NeMo tokenizer object, which inherits from TokenizerSpec. """ - def __init__(self, decoding_cfg, tokenizer: TokenizerSpec, blank_id = None): + def __init__(self, decoding_cfg, tokenizer: TokenizerSpec, blank_id = None,lang=None): if blank_id is None: blank_id = tokenizer.tokenizer.vocab_size @@ -149,13 +149,21 @@ def __init__(self, decoding_cfg, tokenizer: TokenizerSpec, blank_id = None): # Finalize Beam Search Decoding framework if isinstance(self.decoding, ctc_beam_decoding.AbstractBeamCTCInfer): if hasattr(self.tokenizer.tokenizer, 'get_vocab'): - vocab_dict = self.tokenizer.tokenizer.get_vocab() - if isinstance(self.tokenizer.tokenizer, DummyTokenizer): # AggregateTokenizer.DummyTokenizer + if lang is None: + vocab_dict = self.tokenizer.tokenizer.get_vocab() + else: + vocab_dict = self.tokenizer.tokenizers_dict['hi'].tokenizer.get_vocab() + print(vocab_dict) + # breakpoint() + if isinstance(self.tokenizer.tokenizer, DummyTokenizer): # or decoding_cfg.tokenizer_type == "multilingual": # AggregateTokenizer.DummyTokenizer vocab = vocab_dict else: vocab = list(vocab_dict.keys()) self.decoding.set_vocabulary(vocab) - self.decoding.set_tokenizer(tokenizer) + if lang is not None: + self.decoding.set_tokenizer(self.tokenizer.tokenizers_dict['hi']) + else: + self.decoding.set_tokenizer(self.tokenizer) else: logging.warning("Could not resolve the vocabulary of the tokenizer !") diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 14614a1b7..a9577f2bd 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -91,7 +91,12 @@ def __init__(self, cfg: DictConfig, trainer=None): with open_dict(self.cfg): self.cfg.decoding = decoding_cfg if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in cfg.decoder: - self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + if decoding_cfg.strategy == 'pyctcdecode': + # create separate decoders for each language + # self.decoding = [CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys()),lang=l) for l in self.tokenizer.tokenizers_dict.keys()] + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys()),lang='any') + else: + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) else: self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) @@ -140,7 +145,7 @@ def _setup_dataloader_from_config(self, config: 
Optional[Dict]): drop_last=config.get('drop_last', False), shuffle=config['shuffle'], num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), + pin_memory=config.get('pin_memory', False) ) else: return torch.utils.data.DataLoader( @@ -149,7 +154,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): collate_fn=collate_fn, drop_last=config.get('drop_last', False), num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), + pin_memory=config.get('pin_memory', False) ) def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': diff --git a/nemo/collections/asr/models/ctc_bpe_multisoftmax_models.py b/nemo/collections/asr/models/ctc_bpe_multisoftmax_models.py index 6cd608c42..bc53ad7fd 100644 --- a/nemo/collections/asr/models/ctc_bpe_multisoftmax_models.py +++ b/nemo/collections/asr/models/ctc_bpe_multisoftmax_models.py @@ -50,6 +50,8 @@ def __init__(self, cfg: DictConfig, trainer=None): # Initialize a dummy vocabulary vocabulary = self.tokenizer.tokenizer.get_vocab() + print(vocabulary) + breakpoint() # Set the new vocabulary with open_dict(cfg): diff --git a/nemo/collections/common/parts/preprocessing/collections.py b/nemo/collections/common/parts/preprocessing/collections.py index 5c3c35990..3abb14c16 100644 --- a/nemo/collections/common/parts/preprocessing/collections.py +++ b/nemo/collections/common/parts/preprocessing/collections.py @@ -161,6 +161,7 @@ def __init__( if lang is not None: text_tokens = parser(text, lang) else: + print(audio_file) raise ValueError("lang required in manifest when using aggregate tokenizers") else: text_tokens = parser(text) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index af9610da3..6e0a08653 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -50,6 +50,40 @@ from nemo.utils.model_utils import uninject_model_parallel_rank +class MinStepsCallback(EarlyStopping): + def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: int = 3, + verbose: bool = False, mode: str = 'auto', strict: bool = True, + min_steps: int = 5000, check_finite: bool = True, stopping_threshold: Optional[float] = None, + divergence_threshold: Optional[float] = None,check_on_train_epoch_end: Optional[bool] = None, + log_rank_zero_only: bool = False + ): + self.min_steps = min_steps + super().__init__(monitor=monitor, min_delta=min_delta, patience=patience, + verbose=verbose, mode=mode, strict=strict, check_finite=check_finite, + stopping_threshold=stopping_threshold,divergence_threshold=divergence_threshold, + check_on_train_epoch_end=check_on_train_epoch_end,log_rank_zero_only=log_rank_zero_only) + + def _run_early_stopping_check(self, trainer: pytorch_lightning.Trainer) -> None: + if trainer.global_step > self.min_steps: + return super()._run_early_stopping_check(trainer) + else: + return False, f"Yet to reach the minimum steps {trainer.global_step}" + +@dataclass +class MinStepsCallbackParams: + monitor: str = "val_loss" # The metric that early stopping should consider. + mode: str = "min" # inform early stopping whether to look for increase or decrease in monitor. + min_delta: float = 0.001 # smallest change to consider as improvement. + patience: int = 10 # how many (continuous) validation cycles to wait with no improvement and stopping training. 
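# MinStepsCallback above delays early stopping until a minimum number of global steps has
# passed. A condensed sketch of that gate, assuming PyTorch Lightning's EarlyStopping API
# (whose _run_early_stopping_check hook returns None), is:
from pytorch_lightning.callbacks import EarlyStopping

class MinStepsEarlyStopping(EarlyStopping):
    def __init__(self, min_steps: int = 5000, **kwargs):
        super().__init__(**kwargs)
        self.min_steps = min_steps

    def _run_early_stopping_check(self, trainer):
        # Skip the monitored-metric check entirely until min_steps global steps have run,
        # so noisy early validation cycles cannot stop training.
        if trainer.global_step <= self.min_steps:
            return None
        return super()._run_early_stopping_check(trainer)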
+ verbose: bool = True + strict: bool = True + check_finite: bool = True + stopping_threshold: Optional[float] = None + divergence_threshold: Optional[float] = None + check_on_train_epoch_end: Optional[bool] = None + log_rank_zero_only: bool = False + min_steps: int = 5000 + class NotFoundError(NeMoBaseException): """ Raised when a file or folder is not found""" @@ -170,6 +204,8 @@ class ExpManagerConfig: ema: Optional[EMAParams] = EMAParams() # Wall clock time limit max_time_per_run: Optional[str] = None + early_stopping_with_min_steps: Optional[bool] = False + early_stopping_with_min_steps_params: Optional[MinStepsCallbackParams] = MinStepsCallbackParams() class TimingCallback(Callback): @@ -436,10 +472,13 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo every_n_steps=cfg.ema.every_n_steps, ) trainer.callbacks.append(ema_callback) + if cfg.early_stopping_with_min_steps: + min_steps_cb = MinStepsCallback(**cfg.early_stopping_with_min_steps_params) + trainer.callbacks.append(min_steps_cb) - if cfg.create_early_stopping_callback: - early_stop_callback = EarlyStopping(**cfg.early_stopping_callback_params) - trainer.callbacks.append(early_stop_callback) + # if cfg.create_early_stopping_callback: + # early_stop_callback = EarlyStopping(**cfg.early_stopping_callback_params) + # trainer.callbacks.append(early_stop_callback) if cfg.create_checkpoint_callback: configure_checkpointing( From c6cb1c859fd61e0d80a1f8abf171f19ef886e2e8 Mon Sep 17 00:00:00 2001 From: kaushal-py Date: Mon, 5 Feb 2024 17:08:55 +0530 Subject: [PATCH 31/35] Fix inference bug --- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 32 ++++++++++++++----- .../asr/models/hybrid_rnnt_ctc_models.py | 16 ++++++++-- nemo/collections/asr/models/rnnt_models.py | 11 +++++-- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index feafcbc30..fb72bef41 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -338,9 +338,14 @@ def change_vocabulary( decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: + self.decoding = RNNTBPEDecoding( + decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()) + ) + else: + self.decoding = RNNTBPEDecoding( + decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + ) self.wer = RNNTBPEWER( decoding=self.decoding, @@ -405,7 +410,10 @@ def change_vocabulary( ctc_decoding_cls = OmegaConf.create(OmegaConf.to_container(ctc_decoding_cls)) ctc_decoding_cfg = OmegaConf.merge(ctc_decoding_cls, ctc_decoding_cfg) - self.ctc_decoding = CTCBPEDecoding(decoding_cfg=ctc_decoding_cfg, tokenizer=self.tokenizer) + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + else: 
+ self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) self.ctc_wer = WERBPE( decoding=self.ctc_decoding, @@ -444,9 +452,14 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: + self.decoding = RNNTBPEDecoding( + decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()) + ) + else: + self.decoding = RNNTBPEDecoding( + decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + ) self.wer = RNNTBPEWER( decoding=self.decoding, @@ -483,7 +496,10 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - self.ctc_decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer) + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + else: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) self.ctc_wer = WERBPE( decoding=self.ctc_decoding, diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index 6f6bfcb77..47efcd122 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -104,6 +104,7 @@ def transcribe( augmentor: DictConfig = None, verbose: bool = True, logprobs: bool = False, + language_id: str = None, ) -> (List[str], Optional[List['Hypothesis']]): """ Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. @@ -133,6 +134,7 @@ def transcribe( f"{self.cur_decoder} is not supported for cur_decoder. 
Supported values are ['ctc', 'rnnt']" ) if self.cur_decoder == "rnnt": + logging.info("Running with RNN-T decoder..") return super().transcribe( paths2audio_files=paths2audio_files, batch_size=batch_size, @@ -142,7 +144,10 @@ def transcribe( channel_selector=channel_selector, augmentor=augmentor, verbose=verbose, + language_id = language_id ) + + logging.info("Running with CTC decoder..") if paths2audio_files is None or len(paths2audio_files) == 0: return {} @@ -194,13 +199,18 @@ def transcribe( temporary_datalayer = self._setup_transcribe_dataloader(config) logits_list = [] for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose): + signal, signal_len, _, _ = test_batch + if "multisoftmax" not in self.cfg.decoder: + language_ids = None + else: + language_ids = [language_id] * len(signal) encoded, encoded_len = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + input_signal=signal.to(device), input_signal_length=signal_len.to(device) ) - logits = self.ctc_decoder(encoder_output=encoded) + logits = self.ctc_decoder(encoder_output=encoded, language_ids=language_ids) best_hyp, all_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( - logits, encoded_len, return_hypotheses=return_hypotheses, + logits, encoded_len, return_hypotheses=return_hypotheses, lang_ids=language_ids, ) logits = logits.cpu() diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 30d66df5e..7edbd56b0 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -218,6 +218,7 @@ def transcribe( channel_selector: Optional[ChannelSelectorType] = None, augmentor: DictConfig = None, verbose: bool = True, + language_id: str = None, ) -> Tuple[List[str], Optional[List['Hypothesis']]]: """ Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. 
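
The hunks above and below thread an optional language_id argument through the hybrid and RNN-T transcribe() paths so that checkpoints trained with a "multisoftmax" decoder (a shared decoder whose output is partitioned per language over an aggregate tokenizer, as set up elsewhere in this series) can select the correct language at inference time. A minimal usage sketch, assuming such a checkpoint is available; the checkpoint path, audio file, and language code below are placeholders, not artifacts shipped with this patch series:

    # Illustrative sketch only: "multilingual_hybrid.nemo", "sample_hi.wav" and
    # the language code "hi" are assumptions for demonstration purposes.
    import nemo.collections.asr as nemo_asr

    model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from("multilingual_hybrid.nemo")

    # transcribe() now forwards language_id down either branch, depending on
    # which decoder (model.cur_decoder == "rnnt" or "ctc") is currently active.
    transcripts = model.transcribe(
        paths2audio_files=["sample_hi.wav"],
        batch_size=1,
        language_id="hi",  # expanded to language_ids = [language_id] * batch_size inside transcribe()
    )

When "multisoftmax" is not present in cfg.decoder, language_ids stays None inside transcribe() and decoding should fall back to the original single-softmax behaviour, so existing callers are unaffected.
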
@@ -286,15 +287,21 @@ def transcribe( config['augmentor'] = augmentor temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=(not verbose)): + for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose): + signal, signal_len, _, _ = test_batch + if "multisoftmax" not in self.cfg.decoder: + language_ids = None + else: + language_ids = [language_id] * len(signal) encoded, encoded_len = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + input_signal=signal.to(device), input_signal_length=signal_len.to(device) ) best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor( encoded, encoded_len, return_hypotheses=return_hypotheses, partial_hypotheses=partial_hypothesis, + lang_ids=language_ids, ) hypotheses += best_hyp From b6ca4501b9063b1a2e0d14c91d3aca404c26467a Mon Sep 17 00:00:00 2001 From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com> Date: Tue, 6 Feb 2024 14:35:57 +0530 Subject: [PATCH 32/35] Update hybrid_rnnt_ctc_bpe_models.py --- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index feafcbc30..fb72bef41 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -338,9 +338,14 @@ def change_vocabulary( decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: + self.decoding = RNNTBPEDecoding( + decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()) + ) + else: + self.decoding = RNNTBPEDecoding( + decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + ) self.wer = RNNTBPEWER( decoding=self.decoding, @@ -405,7 +410,10 @@ def change_vocabulary( ctc_decoding_cls = OmegaConf.create(OmegaConf.to_container(ctc_decoding_cls)) ctc_decoding_cfg = OmegaConf.merge(ctc_decoding_cls, ctc_decoding_cfg) - self.ctc_decoding = CTCBPEDecoding(decoding_cfg=ctc_decoding_cfg, tokenizer=self.tokenizer) + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + else: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) self.ctc_wer = WERBPE( decoding=self.ctc_decoding, @@ -444,9 +452,14 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and 
"multisoftmax" in self.cfg.decoder: + self.decoding = RNNTBPEDecoding( + decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()) + ) + else: + self.decoding = RNNTBPEDecoding( + decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + ) self.wer = RNNTBPEWER( decoding=self.decoding, @@ -483,7 +496,10 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - self.ctc_decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer) + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + else: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) self.ctc_wer = WERBPE( decoding=self.ctc_decoding, From 29b9f7b5efc4793c98009373b59ed81b5718d0b5 Mon Sep 17 00:00:00 2001 From: ASR Date: Wed, 21 Feb 2024 21:38:56 +0530 Subject: [PATCH 33/35] added decoding fixes --- nemo/collections/asr/models/ctc_bpe_models.py | 12 ++++++++++-- nemo/collections/asr/models/ctc_models.py | 9 +++++++-- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 5 ++++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index a9577f2bd..8e6b10f03 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -324,8 +324,16 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig): decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer,) + + if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: + if decoding_cfg.strategy == 'pyctcdecode': + # create separate decoders for each language + # self.decoding = [CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys()),lang=l) for l in self.tokenizer.tokenizers_dict.keys()] + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys()),lang='any') + else: + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + else: + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) self._wer = WERBPE( decoding=self.decoding, diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 7d7b10b94..123107221 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -122,6 +122,7 @@ def transcribe( channel_selector: Optional[ChannelSelectorType] = None, augmentor: DictConfig = None, verbose: bool = True, + language_id: str = None ) -> List[str]: """ If modify this function, please 
remember update transcribe_partial_audio() in @@ -197,8 +198,12 @@ def transcribe( temporary_datalayer = self._setup_transcribe_dataloader(config) for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose): + if "multisoftmax" not in self.cfg.decoder: + language_ids = None + else: + language_ids = [language_id] * len(test_batch[0]) logits, logits_len, greedy_predictions = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device),language_ids=language_ids ) if logprobs: @@ -208,7 +213,7 @@ def transcribe( hypotheses.append(lg.cpu().numpy()) else: current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( - logits, decoder_lengths=logits_len, return_hypotheses=return_hypotheses, + logits, decoder_lengths=logits_len, return_hypotheses=return_hypotheses, lang_ids=language_ids ) logits = logits.cpu() diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index fb72bef41..aee80b2a6 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -148,7 +148,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): with open_dict(self.cfg.aux_ctc): self.cfg.aux_ctc.decoding = ctc_decoding_cfg if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in cfg.decoder: - self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + if ctc_decoding_cfg.strategy == 'pyctcdecode': + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys()),lang='any') + else: + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) else: self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) From 438aa07422ac4043aaa0c20ed90f7242dba2ef44 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 21 Feb 2024 21:49:01 +0000 Subject: [PATCH 34/35] fixed multisoftmax for single language --- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index aee80b2a6..18dfe484d 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -104,21 +104,21 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.language_masks[language] = [(token_language == language) for _, token_language in self.tokenizer.langs_by_token_id.items()] self.language_masks[language].append(True) # Insert blank token self.ctc_loss = CTCLoss( - num_classes=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()), + num_classes=(self.ctc_decoder._num_classes-1 )// len(self.tokenizer.tokenizers_dict.keys()), zero_infinity=True, reduction=self.cfg.aux_ctc.get("ctc_reduction", "mean_batch"), ) # Setup RNNT Loss loss_name, loss_kwargs = self.extract_rnnt_loss_cfg(self.cfg.get("loss", None)) self.loss = RNNTLoss( - num_classes=self.ctc_decoder._num_classes 
// len(self.tokenizer.tokenizers_dict.keys()), + num_classes=(self.ctc_decoder._num_classes-1) // len(self.tokenizer.tokenizers_dict.keys()), loss_name=loss_name, loss_kwargs=loss_kwargs, reduction=self.cfg.get("rnnt_reduction", "mean_batch"), ) # Setup decoding object self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()) + decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=(self.ctc_decoder._num_classes-1) // len(self.tokenizer.tokenizers_dict.keys()) ) self.decoder.language_masks = self.language_masks @@ -148,10 +148,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): with open_dict(self.cfg.aux_ctc): self.cfg.aux_ctc.decoding = ctc_decoding_cfg if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in cfg.decoder: + breakpoint() if ctc_decoding_cfg.strategy == 'pyctcdecode': - self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys()),lang='any') + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=(self.ctc_decoder._num_classes-1)//len(self.tokenizer.tokenizers_dict.keys()),lang='any') else: - self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=(self.ctc_decoder._num_classes-1)//len(self.tokenizer.tokenizers_dict.keys())) else: self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) @@ -343,7 +344,7 @@ def change_vocabulary( if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes // len(self.tokenizer.tokenizers_dict.keys()) + decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=(self.ctc_decoder._num_classes-1) // len(self.tokenizer.tokenizers_dict.keys()) ) else: self.decoding = RNNTBPEDecoding( @@ -414,7 +415,7 @@ def change_vocabulary( ctc_decoding_cfg = OmegaConf.merge(ctc_decoding_cls, ctc_decoding_cfg) if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: - self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=(self.ctc_decoder._num_classes-1)//len(self.tokenizer.tokenizers_dict.keys())) else: self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) @@ -457,7 +458,7 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes // 
len(self.tokenizer.tokenizers_dict.keys()) + decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, blank_id=(self.ctc_decoder._num_classes-1) // len(self.tokenizer.tokenizers_dict.keys()) ) else: self.decoding = RNNTBPEDecoding( @@ -500,7 +501,7 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in self.cfg.decoder: - self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=self.ctc_decoder._num_classes//len(self.tokenizer.tokenizers_dict.keys())) + self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer, blank_id=(self.ctc_decoder._num_classes-1)//len(self.tokenizer.tokenizers_dict.keys())) else: self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) From fc968b01add44b2de36e9a013389170fa35ddedd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 21 Feb 2024 21:54:09 +0000 Subject: [PATCH 35/35] fixed multisoftmax for single language --- nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 18dfe484d..87231e449 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -148,7 +148,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): with open_dict(self.cfg.aux_ctc): self.cfg.aux_ctc.decoding = ctc_decoding_cfg if (self.tokenizer_type == "agg" or self.tokenizer_type == "multilingual") and "multisoftmax" in cfg.decoder: - breakpoint() if ctc_decoding_cfg.strategy == 'pyctcdecode': self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer, blank_id=(self.ctc_decoder._num_classes-1)//len(self.tokenizer.tokenizers_dict.keys()),lang='any') else: