Adds Video Surfer to autogen_ext (microsoft#4387)

* Add initial code * Update and add readme * Update * update readme * Refine action space * Add tutorial * Improve doc string * Improve doc string * Remove readme * Update toml * Update assignment * Expose the tools * Update pyproject toml * Improve docs * remove assignment' * Run poe format * Update uv lock * Fix mypy errors * Fix linting errors * poe format * run checks * Updaye * Rename submodule * Improve documentation --------- Co-authored-by: Mohammad Mazraeh <[email protected]> Co-authored-by: Ryan Sweet <[email protected]>
DavidLuong98 · Nov 29, 2024 · b6a7d56 · b6a7d56
1 parent f70869f
commit b6a7d56
Show file tree

Hide file tree

Showing 8 changed files with 691 additions and 362 deletions.
diff --git a/python/packages/autogen-ext/pyproject.toml b/python/packages/autogen-ext/pyproject.toml
@@ -31,6 +31,13 @@ web-surfer = [
     "playwright>=1.48.0",
     "pillow>=11.0.0",
 ]
+video-surfer = [
+    "autogen-agentchat==0.4.0.dev8",
+    "openai",
+    "opencv-python>=4.5",
+    "ffmpeg-python",
+    "openai-whisper",
+]
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/autogen_ext"]

diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/__init__.py b/python/packages/autogen-ext/src/autogen_ext/agents/__init__.py
@@ -1,4 +1,3 @@
 from ._openai_assistant_agent import OpenAIAssistantAgent
-from .web_surfer._multimodal_web_surfer import MultimodalWebSurfer
 
-__all__ = ["MultimodalWebSurfer", "OpenAIAssistantAgent"]
+__all__ = ["OpenAIAssistantAgent"]
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/video_surfer/__init__.py b/python/packages/autogen-ext/src/autogen_ext/agents/video_surfer/__init__.py
@@ -0,0 +1,3 @@
+from ._video_surfer import VideoSurferAgent
+
+__all__ = ["VideoSurferAgent"]
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/video_surfer/_video_surfer.py b/python/packages/autogen-ext/src/autogen_ext/agents/video_surfer/_video_surfer.py
@@ -0,0 +1,154 @@
+from typing import Any, Awaitable, Callable, List, Optional
+
+from autogen_agentchat.agents import AssistantAgent
+from autogen_core.components.models import ChatCompletionClient
+from autogen_core.components.tools import Tool
+
+from .tools import (
+    extract_audio,
+    get_screenshot_at,
+    get_video_length,
+    openai_transcribe_video_screenshot,
+    save_screenshot,
+    transcribe_audio_with_timestamps,
+)
+
+
+class VideoSurferAgent(AssistantAgent):
+    """
+    VideoSurferAgent is a specialized agent designed to answer questions about a local video file.
+
+    The agent is an :class:`AssistantAgent` that is pre-configured with a set of video tools
+    (video length, screenshots at given timestamps, audio extraction and transcription) and a
+    system message that tells the model how to combine them to answer a question.
+
+    Available tools:
+
+    - :func:`~autogen_ext.agents.video_surfer.tools.extract_audio`
+    - :func:`~autogen_ext.agents.video_surfer.tools.get_video_length`
+    - :func:`~autogen_ext.agents.video_surfer.tools.transcribe_audio_with_timestamps`
+    - :func:`~autogen_ext.agents.video_surfer.tools.get_screenshot_at`
+    - :func:`~autogen_ext.agents.video_surfer.tools.save_screenshot`
+    - :func:`~autogen_ext.agents.video_surfer.tools.openai_transcribe_video_screenshot`
+
+    Example usage:
+
+        The following example demonstrates how to create a video surfing agent with
+        a model client and generate a response to a simple query about a local video
+        called video.mp4.
+
+        .. code-block:: python
+
+
+            import asyncio
+            from autogen_agentchat.task import Console, TextMentionTermination
+            from autogen_agentchat.teams import RoundRobinGroupChat
+            from autogen_ext.models import OpenAIChatCompletionClient
+            from autogen_ext.agents.video_surfer import VideoSurferAgent
+
+            async def main() -> None:
+                \"\"\"
+                Main function to run the video agent.
+                \"\"\"
+                # Define an agent
+                video_agent = VideoSurferAgent(
+                    name="VideoSurferAgent",
+                    model_client=OpenAIChatCompletionClient(model="gpt-4o-2024-08-06")
+                    )
+
+                # Define termination condition
+                termination = TextMentionTermination("TERMINATE")
+
+                # Define a team
+                agent_team = RoundRobinGroupChat([video_agent], termination_condition=termination)
+
+                # Run the team and stream messages to the console
+                stream = agent_team.run_stream(task="How does Adam define complex tasks in video.mp4? What concrete example of a complex task does he use? Can you save this example to disk as well?")
+                await Console(stream)
+
+            asyncio.run(main())
+
+        The following example demonstrates how to create and use a VideoSurferAgent and UserProxyAgent with MagenticOneGroupChat.
+
+        .. code-block:: python
+
+            import asyncio
+
+            from autogen_agentchat.task import Console
+            from autogen_agentchat.teams import MagenticOneGroupChat
+            from autogen_agentchat.agents import UserProxyAgent
+            from autogen_ext.models import OpenAIChatCompletionClient
+            from autogen_ext.agents.video_surfer import VideoSurferAgent
+
+            async def main() -> None:
+                \"\"\"
+                Main function to run the video agent.
+                \"\"\"
+
+                model_client = OpenAIChatCompletionClient(model="gpt-4o-2024-08-06")
+
+                # Define an agent
+                video_agent = VideoSurferAgent(
+                    name="VideoSurferAgent",
+                    model_client=model_client
+                    )
+
+                user_proxy_agent = UserProxyAgent(
+                    name="User"
+                )
+
+                # Define a team
+                agent_team = MagenticOneGroupChat([user_proxy_agent, video_agent], model_client=model_client,)
+
+                # Run the team and stream messages to the console
+                stream = agent_team.run_stream(task="Find the latest video about Magentic-One on YouTube and extract quotes from it that make sense.")
+                await Console(stream)
+
+            asyncio.run(main())
+    """
+
+    # Used as the agent description when the caller does not supply one.
+    DEFAULT_DESCRIPTION = "An agent that can answer questions about a local video."
+
+    # Used as the system message when the caller does not supply one.
+    DEFAULT_SYSTEM_MESSAGE = """
+    You are a helpful agent that is an expert at answering questions from a video.
+    When asked to answer a question about a video, you should:
+    1. Check if that video is available locally.
+    2. Use the transcription to find which part of the video the question is referring to.
+    3. Optionally use screenshots from those timestamps
+    4. Provide a detailed answer to the question.
+    Reply with TERMINATE when the task has been completed.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        model_client: ChatCompletionClient,
+        *,
+        tools: List[Tool | Callable[..., Any] | Callable[..., Awaitable[Any]]] | None = None,
+        description: Optional[str] = None,
+        system_message: Optional[str] = None,
+    ):
+        """
+        Initialize the VideoSurferAgent.
+
+        Every optional argument falls back to its default when it is falsy, so ``None``,
+        an empty tool list and an empty string all select the default value.
+
+        Args:
+            name (str): The name of the agent.
+            model_client (ChatCompletionClient): The model client used for generating responses.
+            tools (List[Tool | Callable[..., Any] | Callable[..., Awaitable[Any]]] | None, optional):
+                A list of tools or functions the agent can use. If not provided, defaults to all
+                six video tools imported from the ``tools`` module.
+            description (str, optional): A brief description of the agent. Defaults to
+                :attr:`DEFAULT_DESCRIPTION`.
+            system_message (str | None, optional): The system message guiding the agent's behavior.
+                Defaults to :attr:`DEFAULT_SYSTEM_MESSAGE`.
+        """
+        # Resolve the fallbacks up front so the super() call below reads as a plain hand-off.
+        active_tools: List[Tool | Callable[..., Any] | Callable[..., Awaitable[Any]]] = (
+            tools
+            if tools
+            else [
+                get_video_length,
+                get_screenshot_at,
+                save_screenshot,
+                openai_transcribe_video_screenshot,
+                extract_audio,
+                transcribe_audio_with_timestamps,
+            ]
+        )
+        active_description = description if description else self.DEFAULT_DESCRIPTION
+        active_system_message = system_message if system_message else self.DEFAULT_SYSTEM_MESSAGE
+
+        super().__init__(
+            name=name,
+            model_client=model_client,
+            tools=active_tools,
+            description=active_description,
+            system_message=active_system_message,
+        )
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/video_surfer/tools.py b/python/packages/autogen-ext/src/autogen_ext/agents/video_surfer/tools.py
@@ -0,0 +1,162 @@
+import base64
+from typing import Any, Dict, List, Tuple
+
+import cv2
+import ffmpeg
+import numpy as np
+import openai
+import whisper
+
+
+def extract_audio(video_path: str, audio_output_path: str) -> str:
+    """
+    Extracts the audio track of a video file with ffmpeg and writes it in MP3 format.
+
+    :param video_path: Path to the video file.
+    :param audio_output_path: Path to save the extracted audio file.
+    :return: Confirmation message with the path to the saved audio file.
+    """
+    # Build the ffmpeg pipeline step by step: input -> mp3 output -> run.
+    source_stream = ffmpeg.input(video_path)  # type: ignore
+    mp3_output = source_stream.output(audio_output_path, format="mp3")  # type: ignore
+    # overwrite_output=True is passed so the run does not stop on an existing output file.
+    mp3_output.run(quiet=True, overwrite_output=True)  # type: ignore
+
+    return f"Audio extracted and saved to {audio_output_path}."
+
+
+def transcribe_audio_with_timestamps(audio_path: str) -> str:
+    """
+    Transcribes the audio file with timestamps using the Whisper "base" model.
+
+    The model is loaded on every call and transcription is requested for English
+    (``language="en"``).
+
+    :param audio_path: Path to the audio file.
+    :return: Transcription with timestamps, one line per segment in the form
+             ``[<start> - <end>] <text>`` with times formatted to two decimals.
+             Returns an empty string when Whisper reports no segments.
+    """
+    model = whisper.load_model("base")  # type: ignore
+    result: Dict[str, Any] = model.transcribe(audio_path, task="transcribe", language="en", verbose=False)  # type: ignore
+
+    segments: List[Dict[str, Any]] = result["segments"]
+
+    # Join once instead of growing a string with += for every segment.
+    return "".join(
+        f"[{segment['start']:.2f} - {segment['end']:.2f}] {segment['text']}\n" for segment in segments
+    )
+
+
+def get_video_length(video_path: str) -> str:
+    """
+    Returns the length of the video in seconds.
+
+    The duration is computed as the reported frame count divided by the reported
+    frames per second.
+
+    :param video_path: Path to the video file.
+    :return: Sentence stating the duration of the video in seconds (two decimals).
+    :raises IOError: If the video cannot be opened or reports a non-positive frame rate.
+    """
+    cap = cv2.VideoCapture(video_path)
+    try:
+        if not cap.isOpened():
+            raise IOError(f"Cannot open video file {video_path}")
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
+    finally:
+        # Always release the capture handle, including on the error path above.
+        cap.release()
+
+    # A frame rate of 0 would otherwise surface as a ZeroDivisionError.
+    if fps <= 0:
+        raise IOError(f"Cannot determine the frame rate of video file {video_path}")
+
+    duration = frame_count / fps
+    return f"The video is {duration:.2f} seconds long."
+
+
+def save_screenshot(video_path: str, timestamp: float, output_path: str) -> None:
+    """
+    Captures a screenshot at the specified timestamp and saves it to the output path.
+
+    The frame index is computed as ``int(timestamp * fps)`` using the frame rate
+    reported by the video.
+
+    :param video_path: Path to the video file.
+    :param timestamp: Timestamp in seconds.
+    :param output_path: Path to save the screenshot.
+    :raises IOError: If the video cannot be opened, the frame cannot be read, or the
+                     image cannot be written to ``output_path``.
+    """
+    cap = cv2.VideoCapture(video_path)
+    try:
+        if not cap.isOpened():
+            raise IOError(f"Cannot open video file {video_path}")
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_number = int(timestamp * fps)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
+        ret, frame = cap.read()
+        if not ret:
+            raise IOError(f"Failed to capture frame at {timestamp:.2f}s")
+        # imwrite signals failure through its return value; surface it instead of
+        # silently reporting success.
+        if not cv2.imwrite(output_path, frame):
+            raise IOError(f"Failed to write screenshot to {output_path}")
+    finally:
+        # Release the capture handle on every path, including the raises above.
+        cap.release()
+
+
+def openai_transcribe_video_screenshot(video_path: str, timestamp: float) -> str:
+    """
+    Describes the content of a video screenshot captured at the specified timestamp using the OpenAI API.
+
+    The frame is obtained through ``get_screenshot_at``, JPEG-encoded, base64-encoded and
+    sent as a data URL to the ``gpt-4o-mini`` chat completion model together with a short
+    text prompt.
+
+    :param video_path: Path to the video file.
+    :param timestamp: Timestamp in seconds.
+    :return: Description of the screenshot content, or a failure message when no
+             screenshot was captured or the frame could not be JPEG-encoded.
+    """
+    screenshots = get_screenshot_at(video_path, [timestamp])
+    if not screenshots:
+        return "Failed to capture screenshot."
+
+    _, frame = screenshots[0]
+    # Convert the frame to JPEG bytes and then to base64 encoding.
+    encoded_ok, buffer = cv2.imencode(".jpg", frame)
+    if not encoded_ok:
+        # Do not send an unusable buffer to the API.
+        return "Failed to encode screenshot."
+    frame_base64 = base64.b64encode(buffer.tobytes()).decode("utf-8")
+
+    # NOTE(review): Client() is built without arguments, so credentials presumably
+    # come from the environment - confirm against the deployment setup.
+    client = openai.Client()
+
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"Following is a screenshot from the video at {timestamp} seconds. Describe what you see here.",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{frame_base64}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    return str(response.choices[0].message.content)
+
+
+def get_screenshot_at(video_path: str, timestamps: List[float]) -> List[Tuple[float, np.ndarray[Any, Any]]]:
+    """
+    Captures screenshots at the specified timestamps and returns them as Python objects.
+
+    Timestamps are processed in the given order; the first invalid timestamp or failed
+    read aborts the whole call with an exception.
+
+    :param video_path: Path to the video file.
+    :param timestamps: List of timestamps in seconds.
+    :return: List of tuples containing timestamp and the corresponding frame (image).
+             Each frame is a NumPy array (height x width x channels).
+    :raises IOError: If the video cannot be opened, reports a non-positive frame rate,
+                     or a frame cannot be read.
+    :raises ValueError: If a timestamp lies outside ``[0, duration]``.
+    """
+    screenshots: List[Tuple[float, np.ndarray[Any, Any]]] = []
+
+    cap = cv2.VideoCapture(video_path)
+    try:
+        if not cap.isOpened():
+            raise IOError(f"Cannot open video file {video_path}")
+
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        # A frame rate of 0 would otherwise surface as a ZeroDivisionError below.
+        if fps <= 0:
+            raise IOError(f"Cannot determine the frame rate of video file {video_path}")
+        total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
+        duration = total_frames / fps
+
+        for timestamp in timestamps:
+            if not 0 <= timestamp <= duration:
+                raise ValueError(f"Timestamp {timestamp:.2f}s is out of range [0s, {duration:.2f}s]")
+
+            frame_number = int(timestamp * fps)
+            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
+            ret, frame = cap.read()
+            if not ret:
+                raise IOError(f"Failed to capture frame at {timestamp:.2f}s")
+
+            # Append the timestamp and frame to the list
+            screenshots.append((timestamp, frame))
+    finally:
+        # Release the capture handle on every path, including the raises above.
+        cap.release()
+
+    return screenshots
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/__init__.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/__init__.py
@@ -0,0 +1,3 @@
+from ._multimodal_web_surfer import MultimodalWebSurfer
+
+__all__ = ["MultimodalWebSurfer"]
diff --git a/python/packages/autogen-studio/autogenstudio/database/component_factory.py b/python/packages/autogen-studio/autogenstudio/database/component_factory.py
@@ -10,7 +10,7 @@
 from autogen_agentchat.task import MaxMessageTermination, StopMessageTermination, TextMentionTermination
 from autogen_agentchat.teams import RoundRobinGroupChat, SelectorGroupChat
 from autogen_core.components.tools import FunctionTool
-from autogen_ext.agents import MultimodalWebSurfer
+from autogen_ext.agents.web_surfer import MultimodalWebSurfer
 from autogen_ext.models import OpenAIChatCompletionClient
 
 from ..datamodel.types import (
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from ._video_surfer import VideoSurferAgent

		__all__ = ["VideoSurferAgent"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from ._multimodal_web_surfer import MultimodalWebSurfer

		__all__ = ["MultimodalWebSurfer"]