feat: audio transcription

digidem · Nov 27, 2024 · 41fc958 · 41fc958
1 parent 868d2b7
commit 41fc958
Show file tree

Hide file tree

Showing 3 changed files with 198 additions and 7 deletions.
diff --git a/apps/ai_api/eda_ai_api/api/routes/supervisor.py b/apps/ai_api/eda_ai_api/api/routes/supervisor.py
@@ -10,6 +10,10 @@
 from proposal_writer.crew import ProposalWriterCrew
 
 from eda_ai_api.models.supervisor import SupervisorRequest, SupervisorResponse
+from eda_ai_api.utils.audio_converter import convert_ogg
+from eda_ai_api.utils.transcriber import transcribe_audio
+
+import tempfile
 
 router = APIRouter()
 
@@ -69,16 +73,112 @@
 async def supervisor_route(
     message: Optional[str] = Form(None), audio: Optional[UploadFile] = File(None)
 ) -> SupervisorResponse:
+    ALLOWED_FORMATS = {
+        "audio/mpeg": "mp3",
+        "audio/mp4": "mp4",
+        "audio/mpeg": "mpeg",
+        "audio/mpga": "mpga",
+        "audio/mp4": "m4a",
+        "audio/wav": "wav",
+        "audio/webm": "webm",
+        "audio/ogg": "ogg",
+    }
+
+    def detect_content_type(file: UploadFile) -> Optional[str]:
+        """Helper to detect content type from file"""
+        if hasattr(file, "content_type") and file.content_type:
+            return file.content_type
+
+        if hasattr(file, "mime_type") and file.mime_type:
+            return file.mime_type
+
+        ext = os.path.splitext(file.filename)[1].lower()
+        return {
+            ".mp3": "audio/mpeg",
+            ".mp4": "audio/mp4",
+            ".mpeg": "audio/mpeg",
+            ".mpga": "audio/mpga",
+            ".m4a": "audio/mp4",
+            ".wav": "audio/wav",
+            ".webm": "audio/webm",
+            ".ogg": "audio/ogg",
+        }.get(ext)
+
     try:
         if audio:
-            if audio.content_type != "audio/ogg":
-                return SupervisorResponse(
-                    result="Error: Only .ogg audio files are supported"
-                )
+            content_type = detect_content_type(audio)
+            content = await audio.read()
+
+            try:
+                audio_path = ""
+                # Default to mp3 if content type detection failed
+                if not content_type:
+                    content_type = "audio/mpeg"
+
+                if content_type == "audio/ogg":
+                    audio_path = convert_ogg(content, output_format="mp3")
+                else:
+                    with tempfile.NamedTemporaryFile(
+                        suffix=f".{ALLOWED_FORMATS.get(content_type, 'mp3')}",
+                        delete=False,
+                    ) as temp_file:
+                        temp_file.write(content)
+                        audio_path = temp_file.name
+
+                transcription = transcribe_audio(audio_path)
+                print("\n==================================================")
+                print(f"           TRANSCRIPTION: {transcription}")
+                print("==================================================\n")
+
+                if os.path.exists(audio_path):
+                    os.unlink(audio_path)
+
+                # Process transcription through the router chain
+                decision = router_chain.run(message=transcription).strip().lower()
+
+                # Continue with existing decision handling logic...
+                if decision == "discovery":
+                    topics_raw = topic_chain.run(message=transcription)
+                    topics = [t.strip() for t in topics_raw.split(",") if t.strip()][:5]
+                    if not topics:
+                        topics = ["AI", "Technology"]
+                    result = (
+                        OpportunityFinderCrew()
+                        .crew()
+                        .kickoff(inputs={"topics": ", ".join(topics)})
+                    )
+
+                elif decision == "proposal":
+                    extracted = proposal_chain.run(message=transcription).split("|")
+                    community_project = (
+                        extracted[0].strip() if len(extracted) > 0 else "unknown"
+                    )
+                    grant_call = (
+                        extracted[1].strip() if len(extracted) > 1 else "unknown"
+                    )
+                    result = (
+                        ProposalWriterCrew(
+                            community_project=community_project, grant_call=grant_call
+                        )
+                        .crew()
+                        .kickoff()
+                    )
+
+                elif decision == "heartbeat":
+                    result = {"is_alive": True}
+
+                elif decision == "onboarding":
+                    result = OnboardingCrew().crew().kickoff()
+
+                else:
+                    result = {"error": f"Unknown decision type: {decision}"}
+
+                return SupervisorResponse(result=str(result))
 
-            # Here you would add whisper transcription later
-            # For now just acknowledge we received the audio
-            return SupervisorResponse(result=f"Received audio file: {audio.filename}")
+            except Exception as e:
+                if os.path.exists(audio_path):
+                    os.unlink(audio_path)
+                return SupervisorResponse(result=f"Error processing audio: {str(e)}")
 
         elif message:
             # Existing message handling logic

diff --git a/apps/ai_api/eda_ai_api/utils/audio_converter.py b/apps/ai_api/eda_ai_api/utils/audio_converter.py
@@ -0,0 +1,57 @@
+import os
+from pathlib import Path
+import tempfile
+from typing import Literal
+import ffmpeg
+
+AudioFormat = Literal["mp3", "mp4", "mpeg", "mpga", "m4a", "wav", "webm"]
+
+
+def convert_ogg(
+    input_file: str | Path | bytes,
+    output_format: AudioFormat = "mp3",
+    output_path: str | Path | None = None,
+) -> str:
+    """
+    Convert OGG audio file to another format using ffmpeg.
+
+    Args:
+        input_file: Path to input OGG file or bytes content
+        output_format: Desired output format
+        output_path: Optional output path. If None, uses a temporary file
+
+    Returns:
+        str: Path to the converted audio file
+    """
+    try:
+        # Handle bytes input by writing to temp file first
+        if isinstance(input_file, bytes):
+            with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as temp_ogg:
+                temp_ogg.write(input_file)
+                input_file = temp_ogg.name
+
+        # If no output path specified, create temp file
+        if output_path is None:
+            temp_dir = tempfile.gettempdir()
+            output_path = os.path.join(temp_dir, f"converted_audio.{output_format}")
+
+        # Ensure output directory exists
+        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
+
+        # Convert audio using ffmpeg
+        stream = ffmpeg.input(str(input_file))
+        stream = ffmpeg.output(stream, str(output_path))
+        ffmpeg.run(
+            stream, overwrite_output=True, capture_stdout=True, capture_stderr=True
+        )
+
+        # Clean up temp input file if we created one
+        if isinstance(input_file, str) and input_file.startswith(tempfile.gettempdir()):
+            os.unlink(input_file)
+
+        return str(output_path)
+
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"FFmpeg error: {e.stderr.decode()}") from e
+    except Exception as e:
+        raise RuntimeError(f"Error converting audio: {str(e)}") from e
diff --git a/apps/ai_api/eda_ai_api/utils/transcriber.py b/apps/ai_api/eda_ai_api/utils/transcriber.py
@@ -0,0 +1,34 @@
+import os
+from groq import Groq
+from loguru import logger
+
+
+def transcribe_audio(audio_path: str, language: str = "en") -> str:
+    """
+    Transcribe audio file using Groq's Whisper API.
+
+    Args:
+        audio_path: Path to the audio file
+        language: Language code (default: "en")
+
+    Returns:
+        str: Transcribed text
+    """
+    try:
+        client = Groq()
+
+        with open(audio_path, "rb") as file:
+            transcription = client.audio.transcriptions.create(
+                file=(audio_path, file.read()),
+                model="whisper-large-v3-turbo",
+                response_format="json",
+                # language=language,
+                temperature=0.0,
+            )
+
+            logger.info(f"Transcription result: {transcription.text}")
+            return transcription.text
+
+    except Exception as e:
+        logger.error(f"Error transcribing audio: {str(e)}")
+        raise RuntimeError(f"Transcription failed: {str(e)}") from e