From fe42a1ceee0d0237816527036f1664056272e21f Mon Sep 17 00:00:00 2001
From: ldwang
Date: Thu, 31 Oct 2024 16:58:23 +0800
Subject: [PATCH] Create audio_transcribe.py

---
 examples/whisper/audio_transcribe.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 examples/whisper/audio_transcribe.py

diff --git a/examples/whisper/audio_transcribe.py b/examples/whisper/audio_transcribe.py
new file mode 100644
index 00000000..0e980340
--- /dev/null
+++ b/examples/whisper/audio_transcribe.py
@@ -0,0 +1,34 @@
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from datasets import load_dataset
+
+
+# Run on GPU with float16 when available; otherwise fall back to CPU/float32.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+model_id = "openai/whisper-large-v3-turbo"
+
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+)
+model.to(device)
+
+processor = AutoProcessor.from_pretrained(model_id)
+
+# Wrap the model and processor in an automatic-speech-recognition pipeline.
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    torch_dtype=torch_dtype,
+    device=device,
+)
+
+# Transcribe a long-form sample from the LibriSpeech validation split.
+dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
+sample = dataset[0]["audio"]
+
+result = pipe(sample)
+print(result["text"])
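
---

Usage note: the transformers ASR pipeline also accepts a path to a local audio
file (decoded with ffmpeg), so the LibriSpeech sample above can be swapped for
your own recording. A minimal sketch, assuming an audio.mp3 file sits next to
the script and ffmpeg is installed:

    result = pipe("audio.mp3")  # hypothetical local file; any format ffmpeg can decode
    print(result["text"])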