Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: openai vision toolkit #269

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions packages/exchange/src/exchange/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ def to_dict(self) -> dict[str, any]:
return data


@define
class ImageUrl(Content):
    # An image referenced by URL or inline data URI, serialized in the
    # OpenAI chat-completions "image_url" content format.
    url: str

    def to_dict(self) -> dict[str, any]:
        """Serialize to the OpenAI ``image_url`` content schema."""
        payload = {"type": "image_url", "image_url": {"url": self.url}}
        return payload

    @property
    def summary(self) -> str:
        """Short human-readable tag for this image content."""
        return "[Image: {}]".format(self.url)


@define
class Text(Content):
text: str
Expand Down
33 changes: 27 additions & 6 deletions packages/exchange/src/exchange/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@
from attrs import define, field
from jinja2 import Environment, FileSystemLoader

from exchange.content import CONTENT_TYPES, Content, Text, ToolResult, ToolUse
from exchange.content import CONTENT_TYPES, Content, Text, ToolResult, ToolUse, ImageUrl
from exchange.utils import create_object_id

Role = Literal["user", "assistant"]


def validate_role_and_content(instance: "Message", *_: any) -> None: # noqa: ANN401
if instance.role == "user":
if not (instance.text or instance.tool_result):
raise ValueError("User message must include a Text or ToolResult")
if not (instance.text or instance.tool_result or instance.image_content):
raise ValueError("User message must include a Text or ToolResult or ImageUrl")
if instance.tool_use:
raise ValueError("User message does not support ToolUse")
elif instance.role == "assistant":
Expand All @@ -25,9 +25,21 @@ def validate_role_and_content(instance: "Message", *_: any) -> None: # noqa: AN
raise ValueError("Assistant message does not support ToolResult")


def content_converter(contents: list[dict[str, any]]) -> list[Content]:
return [(CONTENT_TYPES[c.pop("type")](**c) if c.__class__ not in CONTENT_TYPES.values() else c) for c in contents]

def content_converter(contents: list) -> list[Content]:
    """Normalize a heterogeneous content list into Content instances.

    Accepts three item forms:
      * Content instances -- passed through unchanged.
      * dicts with a "type" key -- deserialized via CONTENT_TYPES.
      * plain strings -- wrapped as Text for caller convenience.

    Raises:
        ValueError: if a dict carries an unknown "type", or an item is of
            an unsupported kind (previously unknown dict types were
            silently dropped, which hid serialization bugs).
    """
    result = []
    for item in contents:
        if isinstance(item, Content):
            # Already a deserialized content object.
            result.append(item)
        elif isinstance(item, dict) and "type" in item:
            content_type = item.pop("type")
            if content_type not in CONTENT_TYPES:
                raise ValueError(f"Unknown content type: {content_type}")
            result.append(CONTENT_TYPES[content_type](**item))
        elif isinstance(item, str):
            # Bare strings become Text content.
            result.append(Text(text=item))
        else:
            raise ValueError(f"Unsupported content type: {type(item)}")
    return result
Comment on lines +28 to +42
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before updating content_converter I would get the following:

 "Traceback (most recent call last):
  File \"/Users/aarong/Development/goose/packages/exchange/src/exchange/exchange.py\", line 149, in call_function
    output = json.dumps(tool.function(**tool_use.parameters))
 File \"/Users/aarong/Development/goose/src/goose/toolkit/vision.py\", line 22, in describe_image
    user_message = Message(role=\"user\", content=[f\"{instructions}: \", image])
  File \"<attrs generated init exchange.message.Message>\", line 13, in __init__

    _setattr('content', __attr_converter_content(content))
  File \"/Users/aarong/Development/goose/packages/exchange/src/exchange/message.py\", line 29, in content_converter\n    return [(CONTENT_TYPES[c.pop(\"type\")](**c) if c.__class__ not in CONTENT_TYPES.values() else c) for c in contents]

  File \"/Users/aarong/Development/goose/packages/exchange/src/exchange/message.py\", line 29, in <listcomp>\n    return [(CONTENT_TYPES[c.pop(\"type\")](**c) if c.__class__ not in CONTENT_TYPES.values() else c) for c in contents]\nAttributeError: 'str' object has no attribute 'pop'

'str' object has no attribute 'pop'", "is_error": true, "type": "ToolResult"}]

image


@define
class Message:
Expand Down Expand Up @@ -57,6 +69,15 @@ def to_dict(self) -> dict[str, any]:
"created": self.created,
"content": [item.to_dict() for item in self.content],
}

@property
def image_content(self) -> list[ImageUrl]:
    """All images in this message."""
    return [item for item in self.content if isinstance(item, ImageUrl)]

@property
def text(self) -> str:
Expand Down
10 changes: 8 additions & 2 deletions packages/exchange/src/exchange/providers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Optional

import httpx
from exchange.content import Text, ToolResult, ToolUse
from exchange.content import Text, ToolResult, ToolUse, ImageUrl
from exchange.message import Message
from exchange.tool import Tool
from tenacity import retry_if_exception
Expand Down Expand Up @@ -99,7 +99,13 @@ def messages_to_openai_spec(messages: list[Message]) -> list[dict[str, any]]:
"tool_call_id": content.tool_use_id,
}
)

elif isinstance(content, ImageUrl):
output.append(
{
"role": "user",
"content": [content.to_dict()]
}
)
if "content" in converted or "tool_calls" in converted:
output = [converted] + output
messages_spec.extend(output)
Expand Down
61 changes: 61 additions & 0 deletions packages/exchange/tests/test_image_tool_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# import pytest
import base64
from exchange.exchange import Exchange
from exchange.message import Message
from exchange.content import ImageUrl
from exchange.providers.openai import OpenAiProvider
import httpx


def test_describe_image_url():
    """Integration test: sends a picsum photo URL to the live OpenAI API.

    Requires OPENAI_API_KEY and network access. Picsum image id/1 depicts a
    laptop, which the description is expected to mention.
    """
    image_url = ImageUrl(url="https://picsum.photos/id/1/200/200")
    user_message = Message(role="user", content=["Describe the image: ", image_url])
    ex = Exchange(
        provider=OpenAiProvider.from_env(),
        model="gpt-4o",
        system="You are a helpful assistant.",
        tools=[],
    )
    ex.add(user_message)
    res = ex.reply()
    # Case-insensitive match (consistent with test_image_comparison_url_and_data);
    # model responses vary in capitalization.
    assert "laptop" in res.content[0].text.lower()


def get_lorem_picsum_base64(url):
    """Fetch an image over HTTP and return it as a base64 data URI.

    Args:
        url (str): Image URL; redirects (e.g. picsum -> fastly CDN) are
            followed automatically.

    Returns:
        str: ``data:image/jpeg;base64,...`` string for the fetched bytes.

    Raises:
        httpx.HTTPStatusError: if the final response is not a success.
    """
    # follow_redirects=True already resolves the picsum -> fastly redirect
    # chain, so the previous manual Location handling was dead code.
    image_response = httpx.get(url, follow_redirects=True)
    image_response.raise_for_status()
    encoded = base64.standard_b64encode(image_response.content).decode("utf-8")
    return f"data:image/jpeg;base64,{encoded}"


def test_image_comparison_url_and_data():
    """Integration test: the same picsum image sent as base64 data URI and
    as a plain URL should be judged identical by the model.

    Requires OPENAI_API_KEY and network access.
    """
    base_64_image_data = get_lorem_picsum_base64("https://picsum.photos/id/1/200/200")

    user_message = Message(
        role="user",
        content=[
            "Are these images the same? ",
            # Bug fix: previously an ImageUrl was wrapped inside another
            # ImageUrl (ImageUrl(url=<ImageUrl>)), sending a non-string url
            # field to the API. Pass the raw data-URI string instead.
            ImageUrl(url=base_64_image_data),
            ImageUrl(url="https://picsum.photos/id/1/200/200"),
        ],
    )
    ex = Exchange(
        provider=OpenAiProvider.from_env(),
        model="gpt-4o",
        system="Reply with yes or no.",
        tools=[],
    )
    ex.add(user_message)
    res = ex.reply()
    # Both contents reference the same source image, so the model should agree.
    assert "yes" in res.content[0].text.lower()
12 changes: 11 additions & 1 deletion packages/exchange/tests/test_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest

from exchange.message import Message
from exchange.content import Text, ToolUse, ToolResult
from exchange.content import Text, ToolUse, ToolResult, ImageUrl


def test_user_message():
Expand Down Expand Up @@ -38,6 +38,16 @@ def test_message_tool_result():
assert message.tool_result[0].output == "result"


def test_message_image_content():
    """image_content returns only the ImageUrl items, preserving order."""
    first = ImageUrl(url="https://picsum.photos/id/1/200/200")
    second = ImageUrl(url="https://picsum.photos/id/200/200/200")
    message = Message(role="user", content=["here are two images", first, second])

    images = message.image_content
    assert len(images) == 2
    assert [img.url for img in images] == [
        "https://picsum.photos/id/1/200/200",
        "https://picsum.photos/id/200/200/200",
    ]


def test_message_load(tmpdir):
# To emulate the expected relative lookup, we need to create a mock code dir
# and run the load in a subprocess
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ synopsis = "goose.synopsis.toolkit:SynopsisDeveloper"
browser = "goose.toolkit.web_browser:BrowserToolkit"
memory = "goose.toolkit.memory:Memory"
google_workspace = "goose.toolkit.google_workspace:GoogleWorkspace"
vision = "goose.toolkit.vision:VisionToolkit"

[project.entry-points."goose.profile"]
default = "goose.profile:default_profile"
Expand Down
40 changes: 40 additions & 0 deletions src/goose/toolkit/vision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
from exchange.content import ImageUrl
from exchange.providers.utils import encode_image
from exchange.exchange import Exchange
from exchange.message import Message
from exchange.providers.openai import OpenAiProvider
from goose.toolkit.base import Toolkit, tool

class VisionToolkit(Toolkit):
    """A toolkit for image analysis using AI capabilities."""

    # NOTE: the redundant pass-through __init__ was removed; the inherited
    # Toolkit constructor is used unchanged.

    @tool
    def describe_image(self, image: str, instructions: str = "Describe the image") -> str:
        """Analyze an image and return a description or other analysis based on the instructions.

        Args:
            image (str): A URL, a base64 data URI, or a local file path of the
                image to analyze. (The docstring previously claimed ImageUrl,
                but the parameter has always been a string.)
            instructions (str): Instructions for the AI on what kind of analysis to perform.
        """
        if os.path.isfile(image):
            # Local files are inlined as data URIs. Guess the real content
            # type from the extension instead of assuming JPEG; fall back to
            # image/jpeg for unrecognized extensions (previous behavior).
            import mimetypes

            mime_type = mimetypes.guess_type(image)[0] or "image/jpeg"
            encoded_image = encode_image(image)
            image = f"data:{mime_type};base64,{encoded_image}"

        user_message = Message(role="user", content=[f"{instructions}: ", ImageUrl(url=image)])
        exchange = Exchange(
            provider=OpenAiProvider.from_env(),
            model="gpt-4o-mini",
            system="You are a helpful assistant.",
            messages=[user_message],
            tools=[],
        )
        assistant_response = exchange.reply()
        return assistant_response.content[0].text

    def system(self) -> str:
        """Prompt snippet advertising this toolkit's capability to the model."""
        return """This toolkit allows you to visually analyze images using AI capabilities."""
Loading