diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 3414bc3aa..4974c2e3a 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,10 +1,12 @@
 ## Description
 
 **Please include:**
-- A summary of the changes and the related issue
-- Relevant motivation and context
-- Any dependencies or environment changes required
-- If this modifies AI/ML components, include model performance metrics
+
+- **Summary of changes**: Clearly describe the key changes in this PR and their purpose.
+- **Related issues**: Mention if this PR fixes or is connected to any issues (e.g., "Fixes #123" or "Relates to #456").
+- **Motivation and context**: Explain the reason for the changes and the problem they solve.
+- **Environment or dependencies**: Specify any changes in dependencies or environment configurations required for this update.
+- **Impact on AI/ML components**: (If applicable) Describe changes to AI/ML models and include performance metrics (e.g., accuracy, F1-score).
 
 Fixes # (issue)
 
@@ -30,4 +32,4 @@ Please check the options that are relevant:
 
 ## Additional Notes
 
-Include any deployment notes, performance implications, or other relevant information:
\ No newline at end of file
+Include any deployment notes, performance implications, or other relevant information:
diff --git a/cookbook/agents/15_generate_video.py b/cookbook/agents/15_generate_video.py
index 14fb66e9f..d20e3cc55 100644
--- a/cookbook/agents/15_generate_video.py
+++ b/cookbook/agents/15_generate_video.py
@@ -7,7 +7,7 @@
     tools=[ModelsLabs()],
     description="You are an AI agent that can generate videos using the ModelsLabs API.",
     instructions=[
-        "When the user asks you to create a video, use the `create_video` tool to create the video.",
+        "When the user asks you to create a video, use the `generate_media` tool to create the video.",
         "The video will be displayed in the UI automatically below your response, so you don't need to show the video URL in your response.",
         "Politely and courteously let the user know that the video has been generated and will be displayed below as soon as its ready.",
     ],
diff --git a/cookbook/agents/43_generate_replicate_video.py b/cookbook/agents/43_generate_replicate_video.py
new file mode 100644
index 000000000..f855abf23
--- /dev/null
+++ b/cookbook/agents/43_generate_replicate_video.py
@@ -0,0 +1,24 @@
+from phi.agent import Agent
+from phi.model.openai import OpenAIChat
+from phi.tools.replicate import ReplicateTools
+
+"""Create an agent specialized for Replicate AI content generation"""
+
+video_agent = Agent(
+    name="Video Generator Agent",
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[
+        ReplicateTools(model="tencent/hunyuan-video:847dfa8b01e739637fc76f480ede0c1d76408e1d694b830b5dfb8e547bf98405")
+    ],
+    description="You are an AI agent that can generate videos using the Replicate API.",
+    instructions=[
+        "When the user asks you to create a video, use the `generate_media` tool to create the video.",
+        "Return the raw URL to the user.",
+        "Don't convert the video URL to markdown or anything else.",
+    ],
+    markdown=True,
+    debug_mode=True,
+    show_tool_calls=True,
+)
+
+video_agent.print_response("Generate a video of a horse in the desert.")
diff --git a/cookbook/agents/44_generate_replicate_image.py b/cookbook/agents/44_generate_replicate_image.py
new file mode 100644
index 000000000..ebf7af0bd
--- /dev/null
+++ b/cookbook/agents/44_generate_replicate_image.py
@@ -0,0 +1,22 @@
+from phi.agent import Agent
+from phi.model.openai import OpenAIChat
+from phi.tools.replicate import ReplicateTools
+
+"""Create an agent specialized for Replicate AI content generation"""
+
+image_agent = Agent(
+    name="Image Generator Agent",
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[ReplicateTools(model="luma/photon-flash")],
+    description="You are an AI agent that can generate images using the Replicate API.",
+    instructions=[
+        "When the user asks you to create an image, use the `generate_media` tool to create the image.",
+        "Return the raw URL to the user.",
+        "Don't convert the image URL to markdown or anything else.",
+    ],
+    markdown=True,
+    debug_mode=True,
+    show_tool_calls=True,
+)
+
+image_agent.print_response("Generate an image of a horse in the desert.")
diff --git a/cookbook/agents/45_generate_fal_video.py b/cookbook/agents/45_generate_fal_video.py
new file mode 100644
index 000000000..8ed8139b4
--- /dev/null
+++ b/cookbook/agents/45_generate_fal_video.py
@@ -0,0 +1,20 @@
+from phi.agent import Agent
+from phi.model.openai import OpenAIChat
+from phi.tools.fal_tools import FalTools
+
+fal_agent = Agent(
+    name="Fal Video Generator Agent",
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[FalTools("fal-ai/hunyuan-video")],
+    description="You are an AI agent that can generate videos using the Fal API.",
+    instructions=[
+        "When the user asks you to create a video, use the `generate_media` tool to create the video.",
+        "Return the raw URL to the user.",
+        "Don't convert the video URL to markdown or anything else.",
+    ],
+    markdown=True,
+    debug_mode=True,
+    show_tool_calls=True,
+)
+
+fal_agent.print_response("Generate a video of a balloon in the ocean")
diff --git a/cookbook/examples/agents/01_ai_recipe_creator.py b/cookbook/examples/agents/01_ai_recipe_creator.py
new file mode 100644
index 000000000..65c8cb856
--- /dev/null
+++ b/cookbook/examples/agents/01_ai_recipe_creator.py
@@ -0,0 +1,33 @@
+from phi.agent import Agent
+from phi.knowledge.pdf import PDFUrlKnowledgeBase
+from phi.vectordb.pgvector import PgVector
+from phi.tools.exa import ExaTools
+
+db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
+
+knowledge_base = PDFUrlKnowledgeBase(
+    urls=[
+        "https://www.poshantracker.in/pdf/Awareness/MilletsRecipeBook2023_Low%20Res_V5.pdf",
+        "https://www.cardiff.ac.uk/__data/assets/pdf_file/0003/123681/Recipe-Book.pdf",
+    ],
+    vector_db=PgVector(table_name="recipes", db_url=db_url),
+)
+knowledge_base.load(recreate=False)
+
+recipe_agent = Agent(
+    name="RecipeGenie",
+    knowledge_base=knowledge_base,
+    search_knowledge=True,
+    tools=[ExaTools()],
+    markdown=True,
+    instructions=[
+        "Search for recipes based on the ingredients and time available from the knowledge base.",
+        "Include the exact calories, preparation time, cooking instructions, and highlight allergens for the recommended recipes.",
+        "Always search Exa for recipe links or tips related to the recipes, in addition to the knowledge base.",
+        "Provide a list of recipes that match the user's requirements and preferences.",
+    ],
+)
+
+recipe_agent.print_response(
+    "I have potatoes, tomatoes, onions, garlic, ginger, and chicken. Suggest a quick recipe for dinner", stream=True
+)
diff --git a/cookbook/examples/agents/03_itinerary_planner.py b/cookbook/examples/agents/03_itinerary_planner.py
index b6564f330..50c94c850 100644
--- a/cookbook/examples/agents/03_itinerary_planner.py
+++ b/cookbook/examples/agents/03_itinerary_planner.py
@@ -14,7 +14,7 @@
         "Ensure that the gathered data is accurate and tailored to the user's preferences, such as destination, group size, and budget constraints.",
         "Create a clear and concise itinerary that includes: detailed day-by-day travel plan, suggested transportation and accommodation options, activity recommendations (e.g., sightseeing, dining, events), an estimated cost breakdown (covering transportation, accommodation, food, and activities).",
         "If a particular website or travel option is unavailable, provide alternatives from other trusted sources.",
-        "Do not include direct links to external websites or booking platforms in the response."
+        "Do not include direct links to external websites or booking platforms in the response.",
     ],
 )
diff --git a/cookbook/playground/multimodal_agent.py b/cookbook/playground/multimodal_agent.py
index 959032787..25f040568 100644
--- a/cookbook/playground/multimodal_agent.py
+++ b/cookbook/playground/multimodal_agent.py
@@ -10,48 +10,83 @@
 from phi.model.openai import OpenAIChat
 from phi.tools.dalle import Dalle
 from phi.tools.models_labs import ModelsLabs
+from phi.model.response import FileType
 from phi.playground import Playground, serve_playground_app
 from phi.storage.agent.sqlite import SqlAgentStorage
+from phi.tools.fal_tools import FalTools
 
 image_agent_storage_file: str = "tmp/image_agent.db"
 
 image_agent = Agent(
-    name="Image Agent",
+    name="DALL-E Image Agent",
     agent_id="image_agent",
     model=OpenAIChat(id="gpt-4o"),
     tools=[Dalle()],
     description="You are an AI agent that can generate images using DALL-E.",
     instructions=[
         "When the user asks you to create an image, use the `create_image` tool to create the image.",
-        "The image will be displayed in the UI automatically below your response, so you don't need to show the image URL in your response.",
-        "Politely and courteously let the user know that the image has been generated and will be displayed below as soon as its ready.",
+        "Don't provide the URL of the image in the response. Only describe what image was generated.",
     ],
     markdown=True,
     debug_mode=True,
     add_history_to_messages=True,
     add_datetime_to_instructions=True,
-    storage=SqlAgentStorage(table_name="image_agent", db_file="tmp/image_agent.db"),
+    storage=SqlAgentStorage(table_name="image_agent", db_file=image_agent_storage_file),
 )
 
-video_agent = Agent(
-    name="Video Agent",
-    agent_id="video_agent",
+ml_gif_agent = Agent(
+    name="ModelsLab GIF Agent",
+    agent_id="ml_gif_agent",
     model=OpenAIChat(id="gpt-4o"),
-    tools=[ModelsLabs(wait_for_completion=True)],
+    tools=[ModelsLabs(wait_for_completion=True, file_type=FileType.GIF)],
+    description="You are an AI agent that can generate gifs using the ModelsLabs API.",
+    instructions=[
+        "When the user asks you to create a GIF, use the `generate_media` tool to create the GIF.",
Only describe what image was generated.", + ], + markdown=True, + debug_mode=True, + add_history_to_messages=True, + add_datetime_to_instructions=True, + storage=SqlAgentStorage(table_name="ml_gif_agent", db_file=image_agent_storage_file), +) + +ml_video_agent = Agent( + name="ModelsLab Video Agent", + agent_id="ml_video_agent", + model=OpenAIChat(id="gpt-4o"), + tools=[ModelsLabs(wait_for_completion=True, file_type=FileType.MP4)], description="You are an AI agent that can generate videos using the ModelsLabs API.", instructions=[ - "When the user asks you to create a video, use the `create_video` tool to create the video.", - "The video will be displayed in the UI automatically below your response, so you don't need to show the video URL in your response.", - "Politely and courteously let the user know that the video has been generated and will be displayed below as soon as its ready.", + "When the user asks you to create a video, use the `generate_media` tool to create the video.", + "Don't provide the URL of the video in the response. Only describe what video was generated.", ], markdown=True, debug_mode=True, add_history_to_messages=True, add_datetime_to_instructions=True, - storage=SqlAgentStorage(table_name="video_agent", db_file="tmp/video_agent.db"), + storage=SqlAgentStorage(table_name="ml_video_agent", db_file=image_agent_storage_file), ) -app = Playground(agents=[image_agent, video_agent]).get_app() +fal_agent = Agent( + name="Fal Video Agent", + agent_id="fal_agent", + model=OpenAIChat(id="gpt-4o"), + tools=[FalTools("fal-ai/hunyuan-video")], + description="You are an AI agent that can generate videos using the Fal API.", + instructions=[ + "When the user asks you to create a video, use the `generate_media` tool to create the video.", + "Don't provide the URL of the video in the response. Only describe what video was generated.", + ], + markdown=True, + debug_mode=True, + add_history_to_messages=True, + add_datetime_to_instructions=True, + storage=SqlAgentStorage(table_name="fal_agent", db_file=image_agent_storage_file), +) + + +app = Playground(agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent]).get_app(use_async=False) if __name__ == "__main__": serve_playground_app("multimodal_agent:app", reload=True) diff --git a/cookbook/tools/lumalabs_tool.py b/cookbook/tools/lumalabs_tool.py new file mode 100644 index 000000000..8d87d31f1 --- /dev/null +++ b/cookbook/tools/lumalabs_tool.py @@ -0,0 +1,45 @@ +from phi.agent import Agent +from phi.llm.openai import OpenAIChat +from phi.tools.lumalab import LumaLabTools + +"""Create an agent specialized for Luma AI video generation""" + +luma_agent = Agent( + name="Luma Video Agent", + agent_id="luma-video-agent", + llm=OpenAIChat(model="gpt-4o"), + tools=[LumaLabTools()], # Using the LumaLab tool we created + markdown=True, + debug_mode=True, + show_tool_calls=True, + instructions=[ + "You are an agent designed to generate videos using the Luma AI API.", + "You can generate videos in two ways:", + "1. Text-to-Video Generation:", + " - Use the generate_video function for creating videos from text prompts", + " - Default parameters: loop=False, aspect_ratio='16:9', keyframes=None", + "2. 
Image-to-Video Generation:", + " - Use the image_to_video function when starting from one or two images", + " - Required parameters: prompt, start_image_url", + " - Optional parameters: end_image_url, loop=False, aspect_ratio='16:9'", + " - The image URLs must be publicly accessible", + "Choose the appropriate function based on whether the user provides image URLs or just a text prompt.", + "The video will be displayed in the UI automatically below your response, so you don't need to show the video URL in your response.", + "Politely and courteously let the user know that the video has been generated and will be displayed below as soon as its ready.", + "After generating any video, if generation is async (wait_for_completion=False), inform about the generation ID", + ], + system_message=( + "Use generate_video for text-to-video requests and image_to_video for image-based " + "generation. Don't modify default parameters unless specifically requested. " + "Always provide clear feedback about the video generation status." + ), +) + +luma_agent.run("Generate a video of a car in a sky") +# luma_agent.run("Transform this image into a video of a tiger walking: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Walking_tiger_female.jpg/1920px-Walking_tiger_female.jpg") +# luma_agent.run(""" +# Create a transition video between these two images: +# Start: https://img.freepik.com/premium-photo/car-driving-dark-forest-generative-ai_634053-6661.jpg?w=1380 +# End: https://img.freepik.com/free-photo/front-view-black-luxury-sedan-road_114579-5030.jpg?t=st=1733821884~exp=1733825484~hmac=735ca584a9b985c53875fc1ad343c3fd394e1de4db49e5ab1a9ab37ac5f91a36&w=1380 +# Make it a smooth, natural movement +# """) diff --git a/cookbook/vectordb/qdrant_db.py b/cookbook/vectordb/qdrant_db.py index 10d2ee8eb..35b4f542a 100644 --- a/cookbook/vectordb/qdrant_db.py +++ b/cookbook/vectordb/qdrant_db.py @@ -1,4 +1,4 @@ -# pip install qdrant-client +# pip install qdrant-client from phi.vectordb.qdrant import Qdrant from phi.agent import Agent from phi.knowledge.pdf import PDFUrlKnowledgeBase @@ -13,10 +13,7 @@ """ COLLECTION_NAME = "thai-recipes" -vector_db = Qdrant( - collection=COLLECTION_NAME, - url="http://localhost:6333" -) +vector_db = Qdrant(collection=COLLECTION_NAME, url="http://localhost:6333") knowledge_base = PDFUrlKnowledgeBase( urls=["https://phi-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"], @@ -27,4 +24,4 @@ # Create and use the agent agent = Agent(knowledge_base=knowledge_base, use_tools=True, show_tool_calls=True) -agent.print_response("List down the ingredients to make Massaman Gai", markdown=True) \ No newline at end of file +agent.print_response("List down the ingredients to make Massaman Gai", markdown=True) diff --git a/phi/agent/agent.py b/phi/agent/agent.py index f623f5a58..89cc3ea00 100644 --- a/phi/agent/agent.py +++ b/phi/agent/agent.py @@ -28,6 +28,7 @@ from phi.document import Document from phi.agent.session import AgentSession +from phi.model.content import Image, Video from phi.reasoning.step import ReasoningStep, ReasoningSteps, NextAction from phi.run.response import RunEvent, RunResponse, RunResponseExtraData from phi.knowledge.agent import AgentKnowledge @@ -57,9 +58,9 @@ class Agent(BaseModel): # -*- Agent Data # Images associated with this agent - images: Optional[List[Union[str, Dict[str, Any]]]] = None + images: Optional[List[Image]] = None # Videos associated with this agent - videos: Optional[List[Union[str, Dict[str, Any]]]] = None + videos: Optional[List[Video]] = None # 
     # Data associated with this agent
     # name, model, images and videos are automatically added to the agent_data
@@ -573,9 +574,9 @@ def get_agent_data(self) -> Dict[str, Any]:
         if self.model is not None:
             agent_data["model"] = self.model.to_dict()
         if self.images is not None:
-            agent_data["images"] = self.images
+            agent_data["images"] = [img if isinstance(img, dict) else img.model_dump() for img in self.images]
         if self.videos is not None:
-            agent_data["videos"] = self.videos
+            agent_data["videos"] = [vid if isinstance(vid, dict) else vid.model_dump() for vid in self.videos]
         return agent_data
 
     def get_session_data(self) -> Dict[str, Any]:
@@ -588,7 +589,6 @@ def get_session_data(self) -> Dict[str, Any]:
 
     def get_agent_session(self) -> AgentSession:
         """Get an AgentSession object, which can be saved to the database"""
-
         return AgentSession(
             session_id=self.session_id,
             agent_id=self.agent_id,
@@ -632,13 +632,13 @@ def from_agent_session(self, session: AgentSession):
         if "images" in session.agent_data:
             images_from_db = session.agent_data.get("images")
             if self.images is not None and isinstance(self.images, list):
-                self.images.extend(images_from_db)  # type: ignore
+                self.images.extend([Image.model_validate(img) for img in images_from_db])
             else:
                 self.images = images_from_db
         if "videos" in session.agent_data:
             videos_from_db = session.agent_data.get("videos")
             if self.videos is not None and isinstance(self.videos, list):
-                self.videos.extend(videos_from_db)  # type: ignore
+                self.videos.extend([Video.model_validate(vid) for vid in videos_from_db])
             else:
                 self.videos = videos_from_db
@@ -2433,7 +2433,7 @@ def delete_session(self, session_id: str):
     # Handle images and videos
     ###########################################################################
 
-    def add_image(self, image: Union[str, Dict]) -> None:
+    def add_image(self, image: Image) -> None:
         if self.images is None:
             self.images = []
         self.images.append(image)
@@ -2442,7 +2442,7 @@ def add_image(self, image: Union[str, Dict]) -> None:
         if self.run_response.images is None:
             self.run_response.images = []
         self.run_response.images.append(image)
 
-    def add_video(self, video: Union[str, Dict]) -> None:
+    def add_video(self, video: Video) -> None:
         if self.videos is None:
             self.videos = []
         self.videos.append(video)
@@ -2451,10 +2451,10 @@ def add_video(self, video: Union[str, Dict]) -> None:
         if self.run_response.videos is None:
             self.run_response.videos = []
         self.run_response.videos.append(video)
 
-    def get_images(self) -> Optional[List[Union[str, Dict]]]:
+    def get_images(self) -> Optional[List[Image]]:
         return self.images
 
-    def get_videos(self) -> Optional[List[Union[str, Dict]]]:
+    def get_videos(self) -> Optional[List[Video]]:
         return self.videos
 
     ###########################################################################
diff --git a/phi/llm/openai/chat.py b/phi/llm/openai/chat.py
index 037e548d4..666313522 100644
--- a/phi/llm/openai/chat.py
+++ b/phi/llm/openai/chat.py
@@ -181,7 +181,9 @@ def to_dict(self) -> Dict[str, Any]:
         if self.presence_penalty:
             _dict["presence_penalty"] = self.presence_penalty
         if self.response_format:
-            _dict["response_format"] = self.response_format
+            _dict["response_format"] = (
+                self.response_format if isinstance(self.response_format, dict) else str(self.response_format)
+            )
         if self.seed is not None:
             _dict["seed"] = self.seed
         if self.stop:
diff --git a/phi/model/content.py b/phi/model/content.py
new file mode 100644
index 000000000..65a50650e
--- /dev/null
+++ b/phi/model/content.py
@@ -0,0 +1,18 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class Video(BaseModel):
+    id: str
+    url: str
+    original_prompt: Optional[str] = None
+    revised_prompt: Optional[str] = None
+    eta: Optional[str] = None
+
+
+class Image(BaseModel):
+    id: str
+    url: str
+    original_prompt: Optional[str] = None
+    revised_prompt: Optional[str] = None
diff --git a/phi/model/openai/chat.py b/phi/model/openai/chat.py
index ba916bf37..95c20bc17 100644
--- a/phi/model/openai/chat.py
+++ b/phi/model/openai/chat.py
@@ -255,7 +255,9 @@ def to_dict(self) -> Dict[str, Any]:
         if self.presence_penalty is not None:
             model_dict["presence_penalty"] = self.presence_penalty
         if self.response_format is not None:
-            model_dict["response_format"] = self.response_format
+            model_dict["response_format"] = (
+                self.response_format if isinstance(self.response_format, dict) else str(self.response_format)
+            )
         if self.seed is not None:
             model_dict["seed"] = self.seed
         if self.stop is not None:
diff --git a/phi/model/response.py b/phi/model/response.py
index 9fccf3df6..619c96073 100644
--- a/phi/model/response.py
+++ b/phi/model/response.py
@@ -23,3 +23,8 @@ class ModelResponse:
     tool_call: Optional[Dict[str, Any]] = None
     event: str = ModelResponseEvent.assistant_response.value
     created_at: int = int(time())
+
+
+class FileType(str, Enum):
+    MP4 = "mp4"
+    GIF = "gif"
diff --git a/phi/run/response.py b/phi/run/response.py
index b486ea935..a711d66dc 100644
--- a/phi/run/response.py
+++ b/phi/run/response.py
@@ -1,9 +1,10 @@
 from time import time
 from enum import Enum
-from typing import Optional, Any, Dict, List, Union
+from typing import Optional, Any, Dict, List
 
 from pydantic import BaseModel, ConfigDict, Field
 
+from phi.model.content import Video, Image
 from phi.reasoning.step import ReasoningStep
 from phi.model.message import Message, MessageReferences
 
@@ -48,8 +49,8 @@ class RunResponse(BaseModel):
     session_id: Optional[str] = None
     workflow_id: Optional[str] = None
     tools: Optional[List[Dict[str, Any]]] = None
-    images: Optional[List[Union[str, Dict[str, Any]]]] = None
-    videos: Optional[List[Union[str, Dict[str, Any]]]] = None
+    images: Optional[List[Image]] = None
+    videos: Optional[List[Video]] = None
     audio: Optional[Dict] = None
     extra_data: Optional[RunResponseExtraData] = None
     created_at: int = Field(default_factory=lambda: int(time()))
diff --git a/phi/tools/dalle.py b/phi/tools/dalle.py
index 662239573..52258419c 100644
--- a/phi/tools/dalle.py
+++ b/phi/tools/dalle.py
@@ -1,7 +1,9 @@
 from os import getenv
 from typing import Optional, Literal
+from uuid import uuid4
 
 from phi.agent import Agent
+from phi.model.content import Image
 from phi.tools import Toolkit
 from phi.utils.log import logger
 
@@ -80,7 +82,10 @@ def create_image(self, agent: Agent, prompt: str) -> str:
             logger.debug("Image generated successfully")
 
             # Update the run response with the image URLs
-            agent.add_image(response.model_dump())
+            for img in response.data:
+                agent.add_image(
+                    Image(id=str(uuid4()), url=img.url, original_prompt=prompt, revised_prompt=img.revised_prompt)
+                )
             return "Image has been generated successfully and will be displayed below"
         except Exception as e:
             logger.error(f"Failed to generate image: {e}")
diff --git a/phi/tools/fal_tools.py b/phi/tools/fal_tools.py
new file mode 100644
index 000000000..e51eb0926
--- /dev/null
+++ b/phi/tools/fal_tools.py
@@ -0,0 +1,88 @@
+"""
+pip install fal-client
+"""
+
+from os import getenv
+from typing import Optional, Set
+
+from phi.agent import Agent
+from phi.tools import Toolkit
+from phi.utils.log import logger
+from phi.model.content import Video, Image
+from uuid import uuid4
+
+
+try:
+    import fal_client  # type: ignore
+except ImportError:
+    raise ImportError("`fal_client` not installed. Please install using `pip install fal-client`")
+
+
+class FalTools(Toolkit):
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "fal-ai/hunyuan-video",
+    ):
+        super().__init__(name="fal")
+
+        self.api_key = api_key or getenv("FAL_KEY")
+        self.model = model
+        if not self.api_key:
+            logger.error("FAL_KEY not set. Please set the FAL_KEY environment variable.")
+        self.seen_logs: Set[str] = set()
+        self.register(self.generate_media)
+
+    def on_queue_update(self, update):
+        if isinstance(update, fal_client.InProgress) and update.logs:
+            for log in update.logs:
+                message = log["message"]
+                if message not in self.seen_logs:
+                    logger.info(message)
+                    self.seen_logs.add(message)
+
+    def generate_media(self, agent: Agent, prompt: str) -> str:
+        """
+        Use this function to run a model with a given prompt.
+
+        Args:
+            prompt (str): A text description of the task.
+        Returns:
+            str: Return the result of the model.
+        """
+        try:
+            result = fal_client.subscribe(
+                self.model,
+                arguments={"prompt": prompt},
+                with_logs=True,
+                on_queue_update=self.on_queue_update,
+            )
+
+            media_id = str(uuid4())
+
+            if "image" in result:
+                url = result.get("image", {}).get("url", "")
+                agent.add_image(
+                    Image(
+                        id=media_id,
+                        url=url,
+                    )
+                )
+                media_type = "image"
+            elif "video" in result:
+                url = result.get("video", {}).get("url", "")
+                agent.add_video(
+                    Video(
+                        id=media_id,
+                        url=url,
+                    )
+                )
+                media_type = "video"
+            else:
+                logger.error(f"Unsupported type in result: {result}")
+                return f"Unsupported type in result: {result}"
+
+            return f"{media_type.capitalize()} generated successfully at {url}"
+        except Exception as e:
+            logger.error(f"Failed to run model: {e}")
+            return f"Error: {e}"
diff --git a/phi/tools/lumalab.py b/phi/tools/lumalab.py
new file mode 100644
index 000000000..bebb2b652
--- /dev/null
+++ b/phi/tools/lumalab.py
@@ -0,0 +1,168 @@
+import time
+import uuid
+from os import getenv
+from typing import Optional, Dict, Any, Literal, TypedDict
+
+from phi.agent import Agent
+from phi.tools import Toolkit
+from phi.utils.log import logger
+from phi.model.content import Video
+
+try:
+    from lumaai import LumaAI  # type: ignore
+except ImportError:
+    raise ImportError("`lumaai` not installed. Please install using `pip install lumaai`")
+
+
+# Define types for keyframe structure
+class KeyframeImage(TypedDict):
+    type: Literal["image"]
+    url: str
+
+
+Keyframes = Dict[str, KeyframeImage]
+
+
+class LumaLabTools(Toolkit):
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        wait_for_completion: bool = True,
+        poll_interval: int = 3,
+        max_wait_time: int = 300,  # 5 minutes
+    ):
+        super().__init__(name="luma_lab")
+
+        self.wait_for_completion = wait_for_completion
+        self.poll_interval = poll_interval
+        self.max_wait_time = max_wait_time
+        self.api_key = api_key or getenv("LUMAAI_API_KEY")
+
+        if not self.api_key:
+            logger.error("LUMAAI_API_KEY not set. Please set the LUMAAI_API_KEY environment variable.")
+
+        self.client = LumaAI(auth_token=self.api_key)
+        self.register(self.generate_video)
+        self.register(self.image_to_video)
+
+    def image_to_video(
+        self,
+        agent: Agent,
+        prompt: str,
+        start_image_url: str,
+        end_image_url: Optional[str] = None,
+        loop: bool = False,
+        aspect_ratio: Literal["1:1", "16:9", "9:16", "4:3", "3:4", "21:9", "9:21"] = "16:9",
+    ) -> str:
+        """Generate a video from one or two images with a prompt.
+
+        Args:
+            agent: The agent instance
+            prompt: Text description of the desired video
+            start_image_url: URL of the starting image
+            end_image_url: Optional URL of the ending image
+            loop: Whether the video should loop
+            aspect_ratio: Aspect ratio of the output video
+
+        Returns:
+            str: Status message or error
+        """
+        try:
+            # Construct keyframes
+            keyframes: Dict[str, Dict[str, str]] = {"frame0": {"type": "image", "url": start_image_url}}
+
+            # Add end image if provided
+            if end_image_url:
+                keyframes["frame1"] = {"type": "image", "url": end_image_url}
+
+            # Create generation with keyframes
+            generation = self.client.generations.create(
+                prompt=prompt,
+                loop=loop,
+                aspect_ratio=aspect_ratio,
+                keyframes=keyframes,  # type: ignore
+            )
+
+            video_id = str(uuid.uuid4())
+
+            if not self.wait_for_completion:
+                return "Async generation unsupported"
+
+            # Poll for completion
+            seconds_waited = 0
+            while seconds_waited < self.max_wait_time:
+                if not generation or not generation.id:
+                    return "Failed to get generation ID"
+
+                generation = self.client.generations.get(generation.id)
+
+                if generation.state == "completed" and generation.assets:
+                    video_url = generation.assets.video
+                    if video_url:
+                        agent.add_video(Video(id=video_id, url=video_url, eta="completed"))
+                        return f"Video generated successfully: {video_url}"
+                elif generation.state == "failed":
+                    return f"Generation failed: {generation.failure_reason}"
+
+                logger.info(f"Generation in progress... State: {generation.state}")
+                time.sleep(self.poll_interval)
+                seconds_waited += self.poll_interval
+
+            return f"Video generation timed out after {self.max_wait_time} seconds"
+
+        except Exception as e:
+            logger.error(f"Failed to generate video: {e}")
+            return f"Error: {e}"
+
+    def generate_video(
+        self,
+        agent: Agent,
+        prompt: str,
+        loop: bool = False,
+        aspect_ratio: Literal["1:1", "16:9", "9:16", "4:3", "3:4", "21:9", "9:21"] = "16:9",
+        keyframes: Optional[Dict[str, Dict[str, str]]] = None,
+    ) -> str:
+        """Use this function to generate a video given a prompt."""
+
+        try:
+            generation_params: Dict[str, Any] = {
+                "prompt": prompt,
+                "loop": loop,
+                "aspect_ratio": aspect_ratio,
+            }
+
+            if keyframes is not None:
+                generation_params["keyframes"] = keyframes
+
+            generation = self.client.generations.create(**generation_params)  # type: ignore
+
+            video_id = str(uuid.uuid4())
+            if not self.wait_for_completion:
+                return "Async generation unsupported"
+
+            # Poll for completion
+            seconds_waited = 0
+            while seconds_waited < self.max_wait_time:
+                if not generation or not generation.id:
+                    return "Failed to get generation ID"
+
+                generation = self.client.generations.get(generation.id)
+
+                if generation.state == "completed" and generation.assets:
+                    video_url = generation.assets.video
+                    if video_url:
+                        agent.add_video(Video(id=video_id, url=video_url, eta="completed"))
+                        return f"Video generated successfully: {video_url}"
+                elif generation.state == "failed":
+                    return f"Generation failed: {generation.failure_reason}"
State: {generation.state}") + time.sleep(self.poll_interval) + seconds_waited += self.poll_interval + + return f"Video generation timed out after {self.max_wait_time} seconds" + + except Exception as e: + logger.error(f"Failed to generate video: {e}") + return f"Error: {e}" diff --git a/phi/tools/models_labs.py b/phi/tools/models_labs.py index a6142aec0..c32a58ac8 100644 --- a/phi/tools/models_labs.py +++ b/phi/tools/models_labs.py @@ -2,8 +2,11 @@ import json from os import getenv from typing import Optional +from uuid import uuid4 from phi.agent import Agent +from phi.model.content import Video, Image +from phi.model.response import FileType from phi.tools import Toolkit from phi.utils.log import logger @@ -25,6 +28,7 @@ def __init__( add_to_eta: int = 15, # Maximum time to wait for the video to be ready max_wait_time: int = 60, + file_type: FileType = FileType.MP4, ): super().__init__(name="models_labs") @@ -33,13 +37,14 @@ def __init__( self.wait_for_completion = wait_for_completion self.add_to_eta = add_to_eta self.max_wait_time = max_wait_time + self.file_type = file_type self.api_key = api_key or getenv("MODELS_LAB_API_KEY") if not self.api_key: logger.error("MODELS_LAB_API_KEY not set. Please set the MODELS_LAB_API_KEY environment variable.") - self.register(self.generate_video) + self.register(self.generate_media) - def generate_video(self, agent: Agent, prompt: str) -> str: + def generate_media(self, agent: Agent, prompt: str) -> str: """Use this function to generate a video given a prompt. Args: @@ -60,7 +65,7 @@ def generate_video(self, agent: Agent, prompt: str) -> str: "width": 512, "num_frames": 25, "webhook": None, - "output_type": "gif", + "output_type": self.file_type.value, "track_id": None, "negative_prompt": "low quality", "model_id": "cogvideox", @@ -79,25 +84,18 @@ def generate_video(self, agent: Agent, prompt: str) -> str: return f"Error: {result['error']}" eta = result["eta"] - video_url_links = result["future_links"] - video_id = result["id"] - logger.info(f"Video will be ready in {eta} seconds") - logger.info(f"Video URLs: {video_url_links}") - - video_data = [] - for video_url in video_url_links: - video_data.append( - { - "eta": eta, - "video_id": video_id, - "url": video_url, - } - ) - result["data"] = video_data - logger.debug(f"Result: {result}") + url_links = result["future_links"] + logger.info(f"Media will be ready in {eta} seconds") + logger.info(f"Media URLs: {url_links}") + + video_id = str(uuid4()) - # Update the run response with the image URLs - agent.add_video(json.dumps(result)) + logger.debug(f"Result: {result}") + for media_url in url_links: + if self.file_type == FileType.MP4: + agent.add_video(Video(id=str(video_id), url=media_url, eta=str(eta))) + elif self.file_type == FileType.GIF: + agent.add_image(Image(id=str(video_id), url=media_url)) if self.wait_for_completion and isinstance(eta, int): video_ready = False diff --git a/phi/tools/replicate.py b/phi/tools/replicate.py new file mode 100644 index 000000000..7d5fb3e16 --- /dev/null +++ b/phi/tools/replicate.py @@ -0,0 +1,72 @@ +import os +from os import getenv +from urllib.parse import urlparse +from uuid import uuid4 + +from phi.agent import Agent +from phi.model.content import Video, Image +from phi.tools import Toolkit +from phi.utils.log import logger + +try: + import replicate + from replicate.helpers import FileOutput +except ImportError: + raise ImportError("`replicate` not installed. 
Please install using `pip install replicate`.") + + +class ReplicateTools(Toolkit): + def __init__( + self, + model: str = "minimax/video-01", + ): + super().__init__(name="replicate_toolkit") + self.api_key = getenv("REPLICATE_API_TOKEN") + if not self.api_key: + logger.error("REPLICATE_API_TOKEN not set. Please set the REPLICATE_API_TOKEN environment variable.") + self.model = model + self.register(self.generate_media) + + def generate_media(self, agent: Agent, prompt: str) -> str: + """ + Use this function to generate an image or a video using a replicate model. + Args: + prompt (str): A text description of the content. + Returns: + str: Return a URI to the generated video or image. + """ + output: FileOutput = replicate.run(ref=self.model, input={"prompt": prompt}) + + # Parse the URL to extract the file extension + parsed_url = urlparse(output.url) + path = parsed_url.path + _, ext = os.path.splitext(path) + ext = ext.lower() + + # Define supported extensions + image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"} + video_extensions = {".mp4", ".mov", ".avi", ".mkv", ".flv", ".wmv", ".webm"} + + media_id = str(uuid4()) + + if ext in image_extensions: + agent.add_image( + Image( + id=media_id, + url=output.url, + ) + ) + media_type = "image" + elif ext in video_extensions: + agent.add_video( + Video( + id=media_id, + url=output.url, + ) + ) + media_type = "video" + else: + logger.error(f"Unsupported media type with extension '{ext}' for URL: {output.url}") + return f"Unsupported media type with extension '{ext}'." + + return f"{media_type.capitalize()} generated successfully at {output.url}" diff --git a/phi/workspace/settings.py b/phi/workspace/settings.py index a8a37dc65..b7a0845f8 100644 --- a/phi/workspace/settings.py +++ b/phi/workspace/settings.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Optional, List, Dict -from pydantic import field_validator, ValidationInfo +from pydantic import field_validator, ValidationInfo, Field from pydantic_settings import BaseSettings, SettingsConfigDict from phi.api.schemas.workspace import WorkspaceSchema @@ -117,9 +117,9 @@ class WorkspaceSettings(BaseSettings): aws_az4: Optional[str] = None aws_az5: Optional[str] = None # Public subnets. 1 in each AZ. - public_subnets: List[str] = [] + public_subnets: List[str] = Field(default_factory=list) # Private subnets. 1 in each AZ. - private_subnets: List[str] = [] + private_subnets: List[str] = Field(default_factory=list) # Subnet IDs. 1 in each AZ. # Derived from public and private subnets if not provided. subnet_ids: Optional[List[str]] = None diff --git a/pyproject.toml b/pyproject.toml index 6bd10c075..d3dde82f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "phidata" -version = "2.6.7" +version = "2.7.0" description = "Build multi-modal Agents with memory, knowledge and tools." requires-python = ">=3.7" readme = "README.md" @@ -145,6 +145,7 @@ module = [ "pytz.*", "qdrant_client.*", "rapidocr_onnxruntime.*", + "replicate.*", "requests.*", "sentence_transformers.*", "serpapi.*",