# perception.py (forked from ishan0102/vimGPT)
import base64
import json
import os
from io import BytesIO
import openai
from openai.types.chat import (
    ChatCompletionMessageParam,
    ChatCompletionToolParam,
    ChatCompletionMessageToolCall,
    ChatCompletionMessageToolCallParam,
)
from openai import NOT_GIVEN, NotGiven
from dotenv import load_dotenv
from PIL.Image import Image
from typing import List
from glom import glom
from termcolor import colored
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
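# Width (in pixels) that screenshots are resized to before being sent to the model.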
IMG_RES = 760
def resize_image(image: Image):
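    """Resize the image to IMG_RES pixels wide, preserving its aspect ratio."""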
W, H = image.size
image = image.resize((IMG_RES, int(IMG_RES * H / W)))
return image
# Function to encode the image
def encode_and_resize(image: Image):
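    """Resize the image and return it as a base64-encoded PNG string."""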
    image = resize_image(image)
buffer = BytesIO()
image.save(buffer, format="PNG")
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
return encoded_image
def build_action_hint_str(possible_actions_hints: dict[str, str] | None) -> str:
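    """Format the hint label/description pairs into a prompt fragment; returns an empty string when no hints are given."""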
if not possible_actions_hints:
return ""
hint_details = "\n".join(
[f"{k}: {v}" for k, v in possible_actions_hints.items()])
return f'''
For this screenshot, here are details for the hint labels visible in the image. If the next step is to perform an action, use these to help determine which action to take.
{hint_details}
'''
def build_function_calls() -> List[ChatCompletionToolParam]:
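    """Build the tool definitions exposed to the model: click, type_and_click, navigate, scroll, done, and query_result."""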
click_tool: ChatCompletionToolParam = {
"type": "function",
"function": {
"name": "click",
"description": "Click on a button or link",
"parameters": {
"type": "object",
"properties": {
"click": {
"type": "string",
"description": "The value for clicks is a 1-2 letter sequence found within a yellow box on top left of the item you want to click",
},
"description": {
"type": "string",
"description": "A terse description of what action is intended to be performed"
}
},
"required": ["click"],
},
},
}
type_and_click_tool: ChatCompletionToolParam = {
"type": "function",
"function": {
"name": "type_and_click",
"description": "Type text in a textbox and then click on a button or link",
"parameters": {
"type": "object",
"properties": {
"click": {
"type": "string",
"description": "The value for clicks is a 1-2 letter sequence found within a yellow box on top left of the item you want to click",
},
"type": {
"type": "string",
"description": "The text to type in the input textbox",
},
"description": {
"type": "string",
"description": "A terse description of what action is intended to be performed"
}
},
"required": ["click", "type"],
},
},
}
navigation_tool: ChatCompletionToolParam = {
"type": "function",
"function": {
"name": "navigate",
"description": "Navigate to a different website",
"parameters": {
"type": "object",
"properties": {
"navigate": {
"type": "string",
"description": "The URL to navigate to",
},
"description": {
"type": "string",
"description": "A terse description of what action is intended to be performed"
}
},
"required": ["navigate"],
},
},
}
done_tool: ChatCompletionToolParam = {
"type": "function",
"function": {
"name": "done",
"description": "Indicate that the objective is complete",
"parameters": {
"type": "object",
"properties": {},
},
},
}
query_result_tool: ChatCompletionToolParam = {
"type": "function",
"function": {
"name": "query_result",
"description": "Return the result of the objective. This is used for returning query results.",
"parameters": {
"type": "object",
"properties": {
"query_result": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The title of the result"
},
"description": {
"type": "string",
"description": "The description of the result"
}
},
"required": ["title"]
}
},
},
"required": ["query_result"],
}
}
}
scroll_tool: ChatCompletionToolParam = {
"type": "function",
"function": {
"name": "scroll",
"description": "If you think the interesting part of the website is not visible, you can scroll down or up.",
"parameters": {
"type": "object",
"properties": {
"scroll": {
"type": "string",
"description": "The direction to scroll in. Allowed values are 'up' or 'down'"
},
"description": {
"type": "string",
"description": "A terse description of what action is intended to be performed"
}
},
"required": ["scroll"]
}
}
}
return [
click_tool,
type_and_click_tool,
navigation_tool,
scroll_tool,
done_tool,
query_result_tool,
]
def build_initial_prompt(
objective: str,
completion_condition: str,
current_url: str,
possible_actions_hints: dict[str, str]):
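    """Build the opening prompt, stating the objective, completion condition, current URL, and any action hints."""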
return f'''
Given the image of a website, your objective is: {objective} and the completion condition is: {completion_condition}. You are currently on the website: {current_url}.
DO NOT respond to the user under ANY circumstances. Only respond with a tool call.
{build_action_hint_str(possible_actions_hints)}
'''
def build_subsequent_prompt(current_url, possible_actions_hints: dict[str, str]):
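    """Build the follow-up prompt asking for the next action or result on the current page."""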
return f'''What should the next action or result be? You are currently on the website: {current_url}.
{build_action_hint_str(possible_actions_hints)}
'''
def map_tool_call_to_param(tool_call: ChatCompletionMessageToolCall) -> ChatCompletionMessageToolCallParam:
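    """Convert a tool call from an API response into the param dict shape used when replaying it in the message history."""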
return {
"id": tool_call.id,
"function": {
"arguments": tool_call.function.arguments,
"name": tool_call.function.name
},
"type": "function"
}
def get_actions(screenshot: Image,
objective: str,
completion_condition: str,
current_url: str,
possible_actions_hints: dict[str, str],
prompt_history: List[str | List[ChatCompletionMessageToolCall]]):
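    """Ask the vision model for the next action given a screenshot and the conversation so far.

    Rebuilds the message history from prompt_history, appends the new screenshot
    prompt, queries the model with the available tools, records the resulting
    tool calls in prompt_history, and returns the parsed tool-call arguments.
    """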
encoded_screenshot = encode_and_resize(screenshot)
# if prompt_history is empty
if not prompt_history:
next_prompt = build_initial_prompt(
objective, completion_condition, current_url, possible_actions_hints)
else:
next_prompt = build_subsequent_prompt(
current_url, possible_actions_hints)
tools = build_function_calls()
next_message: ChatCompletionMessageParam = {
"role": "user",
"content": [
{
"type": "text",
"text": next_prompt,
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encoded_screenshot}",
},
}
],
}
    messages: List[ChatCompletionMessageParam] = []
    if not prompt_history:
        messages.append(next_message)
    else:
        for prompt in prompt_history:
if isinstance(prompt, str):
messages.append({
"role": "assistant",
"content": prompt,
})
else:
if len(prompt) == 0:
continue
                messages.append({
                    "role": "assistant",
                    "content": None,
                    "tool_calls": [map_tool_call_to_param(tc) for tc in prompt],
                })
for tool_call in prompt:
messages.append({
"role": "tool",
"content": "Success",
"tool_call_id": tool_call.id
})
messages.append(next_message)
# pretty_print_conversation(messages)
tool_calls, json_response = query_open_ai_for_json(
messages, "gpt-4-vision-preview", tools)
if not prompt_history:
prompt_history.append(next_prompt)
prompt_history.append(tool_calls)
return json_response
def adjust_playbook(playbook, original_objective, incoming_objective):
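    """Ask the model to rewrite an existing playbook for a new objective, changing only the values of its actions, and return the parsed response."""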
prompt = f'''
This playbook was generated for the following objective {original_objective}.
The playbook is: {playbook}.
Adjust the playbook for the new objective: {incoming_objective}.
You are not allowed to add actions to or remove actions from the playbook.
You may not change any of the keys in the playbook, only the values.
Return it as a valid JSON array.
'''
_, json_response = query_open_ai_for_json([{
"role": "user",
"content": prompt,
}], "gpt-3.5-turbo")
return json_response
def query_screenshot(screenshot: Image, objective):
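    """Run a one-off query against a screenshot and return the parsed query_result; saves the screenshot to screenshot.png when no result comes back."""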
encoded_screenshot = encode_and_resize(screenshot)
    example_result = json.dumps(
        {"query_result": [{"title": "some title", "description": "some description"}]})
prompt = f'''
Given the image of this website, your objective is to: {objective}.
Return the result in this format: {example_result}. The title and description are strings. The description is optional.
If you have no results, return null for the query_result field.
The result I want from you is a valid JSON object.
Do not return the JSON inside a code block. Only return 1 object with an array of "query_result" objects.
'''
_, json_response = query_open_ai_for_json([{
"role": "user",
"content": [
{
"type": "text",
"text": prompt,
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encoded_screenshot}",
},
}
],
}], "gpt-4-vision-preview")
if ("query_result" in json_response and not json_response["query_result"]) \
or ("message" in json_response):
print("No query result found in response. Saving screenshot.")
# save screenshot for debugging
screenshot.save("screenshot.png")
return json_response
def query_open_ai_for_json(
    messages: List[ChatCompletionMessageParam],
    model: str,
    tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
    max_tokens: int = 130,
) -> tuple[List[ChatCompletionMessageToolCall], dict]:
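    """Call the chat completions endpoint and return the raw tool calls plus the
    parsed JSON arguments of the first one ({"done": True} for the done tool).
    """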
response = openai.chat.completions.create(
model=model,
messages=messages,
tools=tools,
max_tokens=max_tokens,
)
print(f"Response: {response}")
tool_calls = glom(response, "choices.0.message.tool_calls", default=None)
value: ChatCompletionMessageToolCall | None = glom(
response, "choices.0.message.tool_calls.0", default=None)
    if value is None:
print("No tool calls found in response")
raise Exception("No tool calls found in response")
function = value.function
    if function is None:
print("No function found in tool call")
raise Exception("No function found in tool call")
try:
if function.name == 'done':
json_response = {"done": True}
else:
json_response = json.loads(function.arguments)
except json.JSONDecodeError:
print("Error: Invalid JSON response" + str(response.choices))
raise Exception("Error: Invalid JSON response" + str(response.choices))
return tool_calls, json_response
def pretty_print_conversation(messages: List[ChatCompletionMessageParam]):
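    """Print the conversation to the console, one color per role, omitting image payloads from user messages."""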
for message in messages:
if message["role"] == "system":
print(colored(f"system: {message['content']}\n", "red"))
elif message["role"] == "user":
            content = message["content"]
            if isinstance(content, str):
                printable = content
            else:
                # Omit image_url parts so the base64 screenshot is not dumped to the console.
                printable = [
                    part for part in content
                    if isinstance(part, str) or part["type"] != "image_url"
                ]
            print(colored(f"user: {printable}\n", "green"))
elif message["role"] == "assistant" and message.get("tool_calls"):
print(colored(f"assistant: {
list(glom(message, 'tool_calls'))}\n", color="blue"))
elif message["role"] == "assistant" and not message.get("tool_calls"):
print(colored(f"assistant: {glom(message, 'content')}\n", "blue"))
elif message["role"] == "function":
print(colored(f"function ({message['name']}): {
message['content']}\n", "magenta"))