Skip to content

Commit

Permalink
Adds Video Surfer to autogen_ext (microsoft#4387)
Browse files Browse the repository at this point in the history
* Add initial code

* Update and add readme

* Update

* update readme

* Refine action space

* Add tutorial

* Improve doc string

* Improve doc string

* Remove readme

* Update toml

* Update assignment

* Expose the tools

* Update pyproject toml

* Improve docs

* Remove assignment

* Run poe format

* Update uv lock

* Fix mypy errors

* Fix linting errors

* poe format

* run checks

* Update

* Rename submodule

* Improve documentation

---------

Co-authored-by: Mohammad Mazraeh <[email protected]>
Co-authored-by: Ryan Sweet <[email protected]>
  • Loading branch information
3 people authored Nov 29, 2024
1 parent f70869f commit b6a7d56
Show file tree
Hide file tree
Showing 8 changed files with 691 additions and 362 deletions.
7 changes: 7 additions & 0 deletions python/packages/autogen-ext/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ web-surfer = [
"playwright>=1.48.0",
"pillow>=11.0.0",
]
video-surfer = [
"autogen-agentchat==0.4.0.dev8",
"openai",
"opencv-python>=4.5",
"ffmpeg-python",
"openai-whisper",
]

[tool.hatch.build.targets.wheel]
packages = ["src/autogen_ext"]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from ._openai_assistant_agent import OpenAIAssistantAgent
from .web_surfer._multimodal_web_surfer import MultimodalWebSurfer

__all__ = ["MultimodalWebSurfer", "OpenAIAssistantAgent"]
__all__ = ["OpenAIAssistantAgent"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ._video_surfer import VideoSurferAgent

__all__ = ["VideoSurferAgent"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
from typing import Any, Awaitable, Callable, List, Optional

from autogen_agentchat.agents import AssistantAgent
from autogen_core.components.models import ChatCompletionClient
from autogen_core.components.tools import Tool

from .tools import (
extract_audio,
get_screenshot_at,
get_video_length,
openai_transcribe_video_screenshot,
save_screenshot,
transcribe_audio_with_timestamps,
)


class VideoSurferAgent(AssistantAgent):
    """
    An :class:`AssistantAgent` specialized in answering questions about a local video file.

    Unless the caller supplies its own tool list, the agent is equipped with the
    following video tools:

    - :func:`~autogen_ext.agents.video_surfer.tools.extract_audio`
    - :func:`~autogen_ext.agents.video_surfer.tools.get_video_length`
    - :func:`~autogen_ext.agents.video_surfer.tools.transcribe_audio_with_timestamps`
    - :func:`~autogen_ext.agents.video_surfer.tools.get_screenshot_at`
    - :func:`~autogen_ext.agents.video_surfer.tools.save_screenshot`
    - :func:`~autogen_ext.agents.video_surfer.tools.openai_transcribe_video_screenshot`

    Example usage:

    The following example demonstrates how to create a video surfing agent with
    a model client and generate a response to a simple query about a local video
    called video.mp4.

    .. code-block:: python

        import asyncio
        from autogen_agentchat.task import Console, TextMentionTermination
        from autogen_agentchat.teams import RoundRobinGroupChat
        from autogen_ext.models import OpenAIChatCompletionClient
        from autogen_ext.agents.video_surfer import VideoSurferAgent

        async def main() -> None:
            \"\"\"
            Main function to run the video agent.
            \"\"\"
            # Define an agent
            video_agent = VideoSurferAgent(
                name="VideoSurferAgent",
                model_client=OpenAIChatCompletionClient(model="gpt-4o-2024-08-06")
            )

            # Define termination condition
            termination = TextMentionTermination("TERMINATE")

            # Define a team
            agent_team = RoundRobinGroupChat([video_agent], termination_condition=termination)

            # Run the team and stream messages to the console
            stream = agent_team.run_stream(task="How does Adam define complex tasks in video.mp4? What concrete example of a complex task does he use? Can you save this example to disk as well?")
            await Console(stream)

        asyncio.run(main())

    The following example demonstrates how to create and use a VideoSurferAgent
    and a UserProxyAgent with MagenticOneGroupChat.

    .. code-block:: python

        import asyncio
        from autogen_agentchat.task import Console
        from autogen_agentchat.teams import MagenticOneGroupChat
        from autogen_agentchat.agents import UserProxyAgent
        from autogen_ext.models import OpenAIChatCompletionClient
        from autogen_ext.agents.video_surfer import VideoSurferAgent

        async def main() -> None:
            \"\"\"
            Main function to run the video agent.
            \"\"\"
            model_client = OpenAIChatCompletionClient(model="gpt-4o-2024-08-06")

            # Define an agent
            video_agent = VideoSurferAgent(
                name="VideoSurferAgent",
                model_client=model_client
            )

            user_proxy_agent = UserProxyAgent(
                name="User"
            )

            # Define a team
            agent_team = MagenticOneGroupChat([user_proxy_agent, video_agent], model_client=model_client,)

            # Run the team and stream messages to the console
            stream = agent_team.run_stream(task="Find the latest video about Magentic-One on YouTube and extract quotes from it that make sense.")
            await Console(stream)

        asyncio.run(main())
    """

    # Fallback used when the caller passes no (or an empty) description.
    DEFAULT_DESCRIPTION = "An agent that can answer questions about a local video."

    # Fallback system prompt used when the caller passes no (or an empty) system message.
    DEFAULT_SYSTEM_MESSAGE = """
You are a helpful agent that is an expert at answering questions from a video.
When asked to answer a question about a video, you should:
1. Check if that video is available locally.
2. Use the transcription to find which part of the video the question is referring to.
3. Optionally use screenshots from those timestamps
4. Provide a detailed answer to the question.
Reply with TERMINATE when the task has been completed.
"""

    def __init__(
        self,
        name: str,
        model_client: ChatCompletionClient,
        *,
        tools: List[Tool | Callable[..., Any] | Callable[..., Awaitable[Any]]] | None = None,
        description: Optional[str] = None,
        system_message: Optional[str] = None,
    ):
        """
        Build the agent, substituting defaults for every omitted option.

        Args:
            name (str): The name of the agent.
            model_client (ChatCompletionClient): The model client used for generating responses.
            tools (List[Tool | Callable[..., Any] | Callable[..., Awaitable[Any]]] | None, optional):
                Tools or functions the agent can use. When ``None`` or empty, the six video tools
                imported from ``.tools`` are used instead.
            description (str, optional): A brief description of the agent. Falls back to
                ``DEFAULT_DESCRIPTION`` when ``None`` or empty.
            system_message (str | None, optional): The system message guiding the agent's behavior.
                Falls back to ``DEFAULT_SYSTEM_MESSAGE`` when ``None`` or empty.
        """
        active_tools: List[Tool | Callable[..., Any] | Callable[..., Awaitable[Any]]]
        if tools:
            active_tools = tools
        else:
            # A falsy value (None or an empty list) selects the full default tool set.
            active_tools = [
                get_video_length,
                get_screenshot_at,
                save_screenshot,
                openai_transcribe_video_screenshot,
                extract_audio,
                transcribe_audio_with_timestamps,
            ]

        super().__init__(
            name=name,
            model_client=model_client,
            tools=active_tools,
            description=description if description else self.DEFAULT_DESCRIPTION,
            system_message=system_message if system_message else self.DEFAULT_SYSTEM_MESSAGE,
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import base64
from typing import Any, Dict, List, Tuple

import cv2
import ffmpeg
import numpy as np
import openai
import whisper


def extract_audio(video_path: str, audio_output_path: str) -> str:
    """
    Run ffmpeg to write the audio of a video file out in MP3 format.

    Any existing file at ``audio_output_path`` is overwritten, and ffmpeg's
    console output is suppressed.

    :param video_path: Path to the video file.
    :param audio_output_path: Path to save the extracted audio file.
    :return: Confirmation message with the path to the saved audio file.
    """
    source_stream = ffmpeg.input(video_path)  # type: ignore
    mp3_stream = source_stream.output(audio_output_path, format="mp3")  # type: ignore
    mp3_stream.run(quiet=True, overwrite_output=True)  # type: ignore
    return f"Audio extracted and saved to {audio_output_path}."


def transcribe_audio_with_timestamps(audio_path: str) -> str:
    """
    Transcribe an audio file with the Whisper "base" model, one line per segment.

    Each output line has the form ``[<start> - <end>] <text>`` with the segment
    boundaries printed to two decimals and a trailing newline.

    :param audio_path: Path to the audio file.
    :return: Transcription with timestamps.
    """
    whisper_model = whisper.load_model("base")  # type: ignore
    output: Dict[str, Any] = whisper_model.transcribe(  # type: ignore
        audio_path, task="transcribe", language="en", verbose=False
    )

    pieces: List[str] = [
        f"[{seg['start']:.2f} - {seg['end']:.2f}] {seg['text']}\n" for seg in output["segments"]
    ]
    return "".join(pieces)


def get_video_length(video_path: str) -> str:
    """
    Returns the length of the video in seconds.

    The duration is computed as the frame count divided by the frame rate, both
    as reported by OpenCV for the opened file.

    :param video_path: Path to the video file.
    :return: Sentence stating the duration of the video in seconds (two decimals).
    :raises IOError: If the video cannot be opened or no positive frame rate is reported.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video file {video_path}")
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Release the capture handle on every path, including the error ones.
        cap.release()

    # Guard the division: a missing frame rate would otherwise surface as an
    # opaque ZeroDivisionError. `not fps > 0` also rejects NaN.
    if not fps > 0:
        raise IOError(f"Cannot determine the frame rate of video file {video_path}")

    duration = frame_count / fps
    return f"The video is {duration:.2f} seconds long."


def save_screenshot(video_path: str, timestamp: float, output_path: str) -> None:
    """
    Captures a screenshot at the specified timestamp and saves it to the output path.

    The timestamp is converted to a frame index using the frame rate reported by
    OpenCV (``int(timestamp * fps)``).

    :param video_path: Path to the video file.
    :param timestamp: Timestamp in seconds.
    :param output_path: Path to save the screenshot.
    :raises IOError: If the video cannot be opened, the frame cannot be read,
        or the image cannot be written to ``output_path``.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video file {video_path}")
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_number = int(timestamp * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = cap.read()
    finally:
        # Previously the handle leaked whenever the read failed, because the
        # exception was raised before release() was reached.
        cap.release()

    if not ret:
        raise IOError(f"Failed to capture frame at {timestamp:.2f}s")

    # imwrite signals failure through its return value; surface it instead of
    # silently reporting success to the caller.
    if not cv2.imwrite(output_path, frame):
        raise IOError(f"Failed to write screenshot to {output_path}")


def openai_transcribe_video_screenshot(video_path: str, timestamp: float) -> str:
    """
    Transcribes the content of a video screenshot captured at the specified timestamp using OpenAI API.

    The frame is JPEG-encoded, base64-encoded into a data URL, and sent together
    with a short text prompt to the ``gpt-4o-mini`` chat completions model.

    :param video_path: Path to the video file.
    :param timestamp: Timestamp in seconds.
    :return: Description of the screenshot content, or a short failure message
        if no frame was captured or the frame could not be JPEG-encoded.
    """
    screenshots = get_screenshot_at(video_path, [timestamp])
    if not screenshots:
        return "Failed to capture screenshot."

    _, frame = screenshots[0]

    # imencode reports success through its first return value; do not base64 a
    # buffer that was never produced.
    encoded_ok, buffer = cv2.imencode(".jpg", frame)
    if not encoded_ok:
        return "Failed to encode screenshot."
    frame_base64 = base64.b64encode(buffer.tobytes()).decode("utf-8")

    client = openai.Client()

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Following is a screenshot from the video at {timestamp} seconds. Describe what you see here.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{frame_base64}"},
                    },
                ],
            }
        ],
    )

    return str(response.choices[0].message.content)


def get_screenshot_at(video_path: str, timestamps: List[float]) -> List[Tuple[float, np.ndarray[Any, Any]]]:
    """
    Captures screenshots at the specified timestamps and returns them as Python objects.

    :param video_path: Path to the video file.
    :param timestamps: List of timestamps in seconds.
    :return: List of tuples containing timestamp and the corresponding frame (image),
        in the same order as ``timestamps``. Each frame is whatever
        ``cv2.VideoCapture.read`` returns for that position.
    :raises IOError: If the video cannot be opened, no positive frame rate is
        reported, or a frame cannot be read.
    :raises ValueError: If a timestamp lies outside ``[0, duration]``.
    """
    screenshots: List[Tuple[float, np.ndarray[Any, Any]]] = []

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video file {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        # Guard the division below; `not fps > 0` also rejects NaN.
        if not fps > 0:
            raise IOError(f"Cannot determine the frame rate of video file {video_path}")
        duration = total_frames / fps
        # Index of the final frame, used to keep seeks inside the video.
        last_frame = max(int(total_frames) - 1, 0)

        for timestamp in timestamps:
            if not 0 <= timestamp <= duration:
                raise ValueError(f"Timestamp {timestamp:.2f}s is out of range [0s, {duration:.2f}s]")

            # timestamp == duration is accepted by the range check but maps to
            # one frame past the end, so clamp to the last frame.
            frame_number = min(int(timestamp * fps), last_frame)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            ret, frame = cap.read()
            if not ret:
                raise IOError(f"Failed to capture frame at {timestamp:.2f}s")
            screenshots.append((timestamp, frame))
    finally:
        # Release the handle on every exit path; the raises above used to skip it.
        cap.release()

    return screenshots
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ._multimodal_web_surfer import MultimodalWebSurfer

__all__ = ["MultimodalWebSurfer"]
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from autogen_agentchat.task import MaxMessageTermination, StopMessageTermination, TextMentionTermination
from autogen_agentchat.teams import RoundRobinGroupChat, SelectorGroupChat
from autogen_core.components.tools import FunctionTool
from autogen_ext.agents import MultimodalWebSurfer
from autogen_ext.agents.web_surfer import MultimodalWebSurfer
from autogen_ext.models import OpenAIChatCompletionClient

from ..datamodel.types import (
Expand Down
Loading

0 comments on commit b6a7d56

Please sign in to comment.