Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: openai vision toolkit #269

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions packages/exchange/src/exchange/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ def to_dict(self) -> dict[str, any]:
return data


@define
class ImageUrl(Content):
    # An image referenced by URL or inline data URI, serialized in the
    # OpenAI chat-completions "image_url" content format.
    url: str

    def to_dict(self) -> dict[str, any]:
        """Serialize to the OpenAI ``image_url`` content schema."""
        payload = {"type": "image_url", "image_url": {"url": self.url}}
        return payload

    @property
    def summary(self) -> str:
        """Short human-readable tag for this image content."""
        return "[Image: {}]".format(self.url)


@define
class Text(Content):
text: str
Expand Down
33 changes: 27 additions & 6 deletions packages/exchange/src/exchange/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@
from attrs import define, field
from jinja2 import Environment, FileSystemLoader

from exchange.content import CONTENT_TYPES, Content, Text, ToolResult, ToolUse
from exchange.content import CONTENT_TYPES, Content, Text, ToolResult, ToolUse, ImageUrl
from exchange.utils import create_object_id

Role = Literal["user", "assistant"]


def validate_role_and_content(instance: "Message", *_: any) -> None: # noqa: ANN401
if instance.role == "user":
if not (instance.text or instance.tool_result):
raise ValueError("User message must include a Text or ToolResult")
if not (instance.text or instance.tool_result or instance.image_content):
raise ValueError("User message must include a Text or ToolResult or ImageUrl")
if instance.tool_use:
raise ValueError("User message does not support ToolUse")
elif instance.role == "assistant":
Expand All @@ -25,9 +25,21 @@ def validate_role_and_content(instance: "Message", *_: any) -> None: # noqa: AN
raise ValueError("Assistant message does not support ToolResult")


def content_converter(contents: list[dict[str, any]]) -> list[Content]:
return [(CONTENT_TYPES[c.pop("type")](**c) if c.__class__ not in CONTENT_TYPES.values() else c) for c in contents]

def content_converter(contents: list) -> list[Content]:
    """Normalize a heterogeneous content list into Content instances.

    Accepts three item forms:
      * Content instances -- passed through unchanged.
      * dicts with a "type" key -- deserialized via CONTENT_TYPES.
      * plain strings -- wrapped as Text for caller convenience.

    Raises:
        ValueError: if a dict carries an unknown "type", or an item is of
            an unsupported kind (previously unknown dict types were
            silently dropped, which hid serialization bugs).
    """
    result = []
    for item in contents:
        if isinstance(item, Content):
            # Already a deserialized content object.
            result.append(item)
        elif isinstance(item, dict) and "type" in item:
            content_type = item.pop("type")
            if content_type not in CONTENT_TYPES:
                raise ValueError(f"Unknown content type: {content_type}")
            result.append(CONTENT_TYPES[content_type](**item))
        elif isinstance(item, str):
            # Bare strings become Text content.
            result.append(Text(text=item))
        else:
            raise ValueError(f"Unsupported content type: {type(item)}")
    return result
Comment on lines +28 to +42
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before updating content_converter I would get the following:

 "Traceback (most recent call last):
  File \"/Users/aarong/Development/goose/packages/exchange/src/exchange/exchange.py\", line 149, in call_function
    output = json.dumps(tool.function(**tool_use.parameters))
 File \"/Users/aarong/Development/goose/src/goose/toolkit/vision.py\", line 22, in describe_image
    user_message = Message(role=\"user\", content=[f\"{instructions}: \", image])
  File \"<attrs generated init exchange.message.Message>\", line 13, in __init__

    _setattr('content', __attr_converter_content(content))
  File \"/Users/aarong/Development/goose/packages/exchange/src/exchange/message.py\", line 29, in content_converter\n    return [(CONTENT_TYPES[c.pop(\"type\")](**c) if c.__class__ not in CONTENT_TYPES.values() else c) for c in contents]

  File \"/Users/aarong/Development/goose/packages/exchange/src/exchange/message.py\", line 29, in <listcomp>\n    return [(CONTENT_TYPES[c.pop(\"type\")](**c) if c.__class__ not in CONTENT_TYPES.values() else c) for c in contents]\nAttributeError: 'str' object has no attribute 'pop'

'str' object has no attribute 'pop'", "is_error": true, "type": "ToolResult"}]

image


@define
class Message:
Expand Down Expand Up @@ -57,6 +69,15 @@ def to_dict(self) -> dict[str, any]:
"created": self.created,
"content": [item.to_dict() for item in self.content],
}

@property
def image_content(self) -> list[ImageUrl]:
    """All images in this message."""
    return [item for item in self.content if isinstance(item, ImageUrl)]

@property
def text(self) -> str:
Expand Down
10 changes: 8 additions & 2 deletions packages/exchange/src/exchange/providers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Optional

import httpx
from exchange.content import Text, ToolResult, ToolUse
from exchange.content import Text, ToolResult, ToolUse, ImageUrl
from exchange.message import Message
from exchange.tool import Tool
from tenacity import retry_if_exception
Expand Down Expand Up @@ -99,7 +99,13 @@ def messages_to_openai_spec(messages: list[Message]) -> list[dict[str, any]]:
"tool_call_id": content.tool_use_id,
}
)

elif isinstance(content, ImageUrl):
output.append(
{
"role": "user",
"content": [content.to_dict()]
}
)
if "content" in converted or "tool_calls" in converted:
output = [converted] + output
messages_spec.extend(output)
Expand Down
61 changes: 61 additions & 0 deletions packages/exchange/tests/test_image_tool_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# import pytest
import base64
from exchange.exchange import Exchange
from exchange.message import Message
from exchange.content import ImageUrl
from exchange.providers.openai import OpenAiProvider
import httpx


def test_describe_image_url():
    """Integration test: sends a picsum photo URL to the live OpenAI API.

    Requires OPENAI_API_KEY and network access. Picsum image id/1 depicts a
    laptop, which the description is expected to mention.
    """
    image_url = ImageUrl(url="https://picsum.photos/id/1/200/200")
    user_message = Message(role="user", content=["Describe the image: ", image_url])
    ex = Exchange(
        provider=OpenAiProvider.from_env(),
        model="gpt-4o",
        system="You are a helpful assistant.",
        tools=[],
    )
    ex.add(user_message)
    res = ex.reply()
    # Case-insensitive match (consistent with test_image_comparison_url_and_data);
    # model responses vary in capitalization.
    assert "laptop" in res.content[0].text.lower()


def get_lorem_picsum_base64(url):
    """Fetch an image over HTTP and return it as a base64 data URI.

    Args:
        url (str): Image URL; redirects (e.g. picsum -> fastly CDN) are
            followed automatically.

    Returns:
        str: ``data:image/jpeg;base64,...`` string for the fetched bytes.

    Raises:
        httpx.HTTPStatusError: if the final response is not a success.
    """
    # follow_redirects=True already resolves the picsum -> fastly redirect
    # chain, so the previous manual Location handling was dead code.
    image_response = httpx.get(url, follow_redirects=True)
    image_response.raise_for_status()
    encoded = base64.standard_b64encode(image_response.content).decode("utf-8")
    return f"data:image/jpeg;base64,{encoded}"


def test_image_comparison_url_and_data():
    """Integration test: the same picsum image sent as base64 data URI and
    as a plain URL should be judged identical by the model.

    Requires OPENAI_API_KEY and network access.
    """
    base_64_image_data = get_lorem_picsum_base64("https://picsum.photos/id/1/200/200")

    user_message = Message(
        role="user",
        content=[
            "Are these images the same? ",
            # Bug fix: previously an ImageUrl was wrapped inside another
            # ImageUrl (ImageUrl(url=<ImageUrl>)), sending a non-string url
            # field to the API. Pass the raw data-URI string instead.
            ImageUrl(url=base_64_image_data),
            ImageUrl(url="https://picsum.photos/id/1/200/200"),
        ],
    )
    ex = Exchange(
        provider=OpenAiProvider.from_env(),
        model="gpt-4o",
        system="Reply with yes or no.",
        tools=[],
    )
    ex.add(user_message)
    res = ex.reply()
    # Both contents reference the same source image, so the model should agree.
    assert "yes" in res.content[0].text.lower()
12 changes: 11 additions & 1 deletion packages/exchange/tests/test_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest

from exchange.message import Message
from exchange.content import Text, ToolUse, ToolResult
from exchange.content import Text, ToolUse, ToolResult, ImageUrl


def test_user_message():
Expand Down Expand Up @@ -38,6 +38,16 @@ def test_message_tool_result():
assert message.tool_result[0].output == "result"


def test_message_image_content():
    """image_content returns only the ImageUrl items, preserving order."""
    first = ImageUrl(url="https://picsum.photos/id/1/200/200")
    second = ImageUrl(url="https://picsum.photos/id/200/200/200")
    message = Message(role="user", content=["here are two images", first, second])

    images = message.image_content
    assert len(images) == 2
    assert [img.url for img in images] == [
        "https://picsum.photos/id/1/200/200",
        "https://picsum.photos/id/200/200/200",
    ]


def test_message_load(tmpdir):
# To emulate the expected relative lookup, we need to create a mock code dir
# and run the load in a subprocess
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ synopsis = "goose.synopsis.toolkit:SynopsisDeveloper"
browser = "goose.toolkit.web_browser:BrowserToolkit"
memory = "goose.toolkit.memory:Memory"
google_workspace = "goose.toolkit.google_workspace:GoogleWorkspace"
vision = "goose.toolkit.vision:VisionToolkit"

[project.entry-points."goose.profile"]
default = "goose.profile:default_profile"
Expand Down
40 changes: 40 additions & 0 deletions src/goose/toolkit/vision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
from exchange.content import ImageUrl
from exchange.providers.utils import encode_image
from exchange.exchange import Exchange
from exchange.message import Message
from exchange.providers.openai import OpenAiProvider
from goose.toolkit.base import Toolkit, tool

class VisionToolkit(Toolkit):
    """A toolkit for image analysis using AI capabilities."""

    # NOTE: the redundant pass-through __init__ was removed; the inherited
    # Toolkit constructor is used unchanged.

    @tool
    def describe_image(self, image: str, instructions: str = "Describe the image") -> str:
        """Analyze an image and return a description or other analysis based on the instructions.

        Args:
            image (str): A URL, a base64 data URI, or a local file path of the
                image to analyze. (The docstring previously claimed ImageUrl,
                but the parameter has always been a string.)
            instructions (str): Instructions for the AI on what kind of analysis to perform.
        """
        if os.path.isfile(image):
            # Local files are inlined as data URIs. Guess the real content
            # type from the extension instead of assuming JPEG; fall back to
            # image/jpeg for unrecognized extensions (previous behavior).
            import mimetypes

            mime_type = mimetypes.guess_type(image)[0] or "image/jpeg"
            encoded_image = encode_image(image)
            image = f"data:{mime_type};base64,{encoded_image}"

        user_message = Message(role="user", content=[f"{instructions}: ", ImageUrl(url=image)])
        exchange = Exchange(
            provider=OpenAiProvider.from_env(),
            model="gpt-4o-mini",
            system="You are a helpful assistant.",
            messages=[user_message],
            tools=[],
        )
        assistant_response = exchange.reply()
        return assistant_response.content[0].text

    def system(self) -> str:
        """Prompt snippet advertising this toolkit's capability to the model."""
        return """This toolkit allows you to visually analyze images using AI capabilities."""
Loading