diff --git a/examples/whisper/audio_transcribe.py b/examples/whisper/audio_transcribe.py
new file mode 100644
index 00000000..0e980340
--- /dev/null
+++ b/examples/whisper/audio_transcribe.py
@@ -0,0 +1,36 @@
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from datasets import load_dataset
+
+
+# Use the GPU with float16 when available; otherwise fall back to CPU/float32.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+model_id = "openai/whisper-large-v3-turbo"
+
+# Load the Whisper checkpoint with memory-efficient initialization.
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+)
+model.to(device)
+
+processor = AutoProcessor.from_pretrained(model_id)
+
+# Wrap the model, tokenizer, and feature extractor in an ASR pipeline.
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    torch_dtype=torch_dtype,
+    device=device,
+)
+
+# Fetch a long-form audio sample to transcribe.
+dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
+sample = dataset[0]["audio"]
+
+# Transcribe the loaded sample (not a hardcoded file) and print the text.
+result = pipe(sample)
+print(result["text"])