diff --git a/README.md b/README.md
index d4b0b80..874f119 100644
--- a/README.md
+++ b/README.md
@@ -153,21 +153,23 @@ The files for the CTranslate2 versions of Moonshine are available at [huggingfac
 Both models are also available on the HuggingFace hub and can be used with the `transformers` library, as follows:
 
 ```python
-from transformers import AutoModelForSpeechSeq2Seq, AutoConfig, PreTrainedTokenizerFast
+import torch
+from transformers import AutoProcessor, MoonshineForConditionalGeneration
+from datasets import load_dataset
 
-import torchaudio
-import sys
+processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
+model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")
 
-audio, sr = torchaudio.load(sys.argv[1])
-if sr != 16000:
-    audio = torchaudio.functional.resample(audio, sr, 16000)
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio_array = ds[0]["audio"]["array"]
 
-# 'usefulsensors/moonshine-base' for the base model
-model = AutoModelForSpeechSeq2Seq.from_pretrained('usefulsensors/moonshine-tiny', trust_remote_code=True)
-tokenizer = PreTrainedTokenizerFast.from_pretrained('usefulsensors/moonshine-tiny')
+inputs = processor(audio_array, return_tensors="pt")
+input_values = inputs.input_values
 
-tokens = model(audio)
-print(tokenizer.decode(tokens[0], skip_special_tokens=True))
+generated_ids = model.generate(input_values, max_new_tokens=100)
+
+transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+print(transcription)
 ```
 
 ## TODO
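The updated snippet pulls pre-recorded 16 kHz audio from a test dataset, so the resampling step from the removed code disappears; for a local file you would still need it. A minimal sketch of the same flow against a file path, assuming the Moonshine processor accepts the `sampling_rate` keyword the way other `transformers` speech processors do:

```python
import sys

import torchaudio
from transformers import AutoProcessor, MoonshineForConditionalGeneration

processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

# Load the file named on the command line and resample to the 16 kHz
# rate the model expects, as the removed torchaudio example did.
audio, sr = torchaudio.load(sys.argv[1])
if sr != 16000:
    audio = torchaudio.functional.resample(audio, sr, 16000)

# sampling_rate is an assumption: most transformers speech processors
# validate the input rate via this keyword. squeeze(0) assumes mono audio.
inputs = processor(audio.squeeze(0).numpy(), sampling_rate=16000, return_tensors="pt")
generated_ids = model.generate(inputs.input_values, max_new_tokens=100)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```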