-
Notifications
You must be signed in to change notification settings - Fork 3
/
world_model.py
243 lines (215 loc) · 16.1 KB
/
world_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import re
import os
import anthropic
from openai import OpenAI
import base64
from llms.utils import openai_messages_to_claude
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode and the base64 bytes are decoded as
    UTF-8 so the result is a plain ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def parse_proposed_action(text):
    """Extract the ``OPERATION TYPE`` and ``ELEMENT`` fields from ``text``.

    Each field's value runs from its label up to the next blank line
    (``\\n\\n``); when no blank line follows, the value runs to the end of
    ``text``. Values are returned as captured (no stripping).

    Returns:
        A ``(operation_type, element)`` tuple; either entry is ``None``
        when its label does not occur in ``text``.
    """

    def _capture(pattern):
        # Group 1 is the blank-line-terminated form, group 2 the
        # run-to-end fallback; at most one of them participates.
        found = re.search(pattern, text)
        if found is None:
            return None
        return found.group(1) or found.group(2)

    return (
        _capture(r'OPERATION TYPE:\s*([\s\S]+?)\n\n|OPERATION TYPE:\s*([\s\S]+)'),
        _capture(r'ELEMENT:\s*([\s\S]+?)\n\n|ELEMENT:\s*([\s\S]+)'),
    )
class WebWorldModel:
    """Use a chat LLM to imagine how a webpage changes after actions.

    The model is queried through ``client``. An ``anthropic.Anthropic``
    instance is sent requests via ``client.messages.create`` (after converting
    the messages with ``openai_messages_to_claude``); any other client is
    called through the OpenAI-style ``client.chat.completions.create``.

    Throughout the class an "imagination" is a two-item sequence
    ``[action, predicted_state]``; lists of them describe the imagined
    history applied on top of the initial screenshot.
    """

    # System prompt of the action-proposing agent (same for every format).
    _AGENT_SYSTEM_PROMPT = "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. Available operation types include \n```click```: Use this to click.\n```type```: Use this to type the content into a field.\n```hover```: Hover over an element.\n```press```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n```scroll down``` or ```scroll up```: Scroll the page up or down.\n\nURL Navigation Actions:\n```goto```: Navigate to a specific URL.\n```go_back```: Navigate to the previously viewed page.\n```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n```stop```: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket."

    # System prompts for predicting the effect of an action on the real
    # (screenshotted) page, keyed by the requested output format.
    _WEBSITE_SYSTEM_PROMPTS = {
        "change": "You are an agent that predicts the effect of an action on a webpage. You will be given a screenshot of a webpage and an operation to perform on the webpage. You are required to predict the changes that will occur on the webpage after the operation is performed, such as the appearance of new elements, the disappearance of existing elements, or changes in the content of existing elements. The operation type and the element to operate will be provided in the prompt. Directly output 'State changes: ...' and don't output anything else. Try to be as comprehensive and detailed as possible.",
        "html": "You are an agent that predicts the effect of an action on a webpage. You will be given a screenshot of a webpage and an operation to perform on the webpage. You are required to predict the state of the webpage after the operation is performed. In particular, you should generate the html code for the new webpage, highlight the most likely elements appearing in the new page. The operation type and the element to operate will be provided in the prompt. Directly output 'New webpage: ...' and don't output anything else. Try to be as comprehensive and detailed as possible.",
        "accessibility": "You are an agent that predicts the effect of an action on a webpage. You will be given a screenshot of a webpage and an operation to perform on the webpage. You are required to predict the state of the webpage after the operation is performed. In particular, you should describe the new webpage as an accessibility tree, highlight the most likely elements appearing in the new page. The operation type and the element to operate will be provided in the prompt. Directly output 'New webpage: ...' and don't output anything else. Try to be as comprehensive and detailed as possible.",
    }

    # System prompts for predicting the effect of an action on top of an
    # already-imagined history, keyed by the requested output format.
    _IMAGINATION_SYSTEM_PROMPTS = {
        "change": "You are an agent that predicts the effect of an action on a webpage. You will be given a screenshot of a webpage, a sequence of actions and state changes applied to the initial screenshot, and an operation to perform on the webpage. You are required to predict the new changes that will occur on the webpage after the operation is performed, such as the appearance of new elements, the disappearance of existing elements, or changes in the content of existing elements. The operation type and the element to operate will be provided in the prompt. Directly output 'State changes: ...' and don't output anything else. Try to be as comprehensive and detailed as possible.",
        "html": "You are an agent that predicts the effect of an action on a webpage. You will be given a screenshot of a webpage, a sequence of actions and state changes applied to the initial screenshot, and an operation to perform on the webpage. You are required to predict the state of the webpage after the operation is performed. In particular, you should generate the html code for the new webpage, highlight the most likely elements appearing in the new page. The operation type and the element to operate will be provided in the prompt. Directly output 'New webpage: ...' and don't output anything else. Try to be as comprehensive and detailed as possible.",
        "accessibility": "You are an agent that predicts the effect of an action on a webpage. You will be given a screenshot of a webpage, a sequence of actions and state changes applied to the initial screenshot, and an operation to perform on the webpage. You are required to predict the state of the webpage after the operation is performed. In particular, you should describe the new webpage as an accessibility tree, highlight the most likely elements appearing in the new page. The operation type and the element to operate will be provided in the prompt. Directly output 'New webpage: ...' and don't output anything else. Try to be as comprehensive and detailed as possible.",
    }

    def __init__(self, client, openai_model='gpt-4o',
                 claude_model="claude-3-5-sonnet-20240620",
                 claude_max_tokens=2048):
        """Store the API client and the request settings.

        Args:
            client: ``anthropic.Anthropic`` instance, or an object exposing
                the OpenAI-style ``chat.completions.create`` method.
            openai_model: Model name sent on the OpenAI-style path.
            claude_model: Model name sent on the Anthropic path.
            claude_max_tokens: ``max_tokens`` sent on the Anthropic path
                (the OpenAI-style request does not set a limit).
        """
        self.client = client
        self.openai_model = openai_model
        self.claude_model = claude_model
        self.claude_max_tokens = claude_max_tokens

    @staticmethod
    def _system_prompt_for(prompts, format):
        """Look up the system prompt for ``format`` or raise ``ValueError``."""
        try:
            return prompts[format]
        except KeyError:
            raise ValueError(
                "Unsupported format {!r}; expected one of {}".format(
                    format, sorted(prompts))
            ) from None

    @staticmethod
    def _screenshot_user_message(prompt_text, screenshot):
        """Build a user message carrying ``prompt_text`` plus the screenshot."""
        return {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": {
                    "url": screenshot,
                    "detail": "high"
                }},
            ],
        }

    @staticmethod
    def _render_history(imaginations):
        """Render every imagined ``[action, state]`` pair as an ACTION entry."""
        return "".join(
            "ACTION {}: \n{}\n{}\n\n".format(str(i + 1), item[0], item[1])
            for i, item in enumerate(imaginations)
        )

    def _query_llm(self, input_messages):
        """Send OpenAI-style ``input_messages`` to the client; return its text.

        Returns the text of the first content block (Anthropic) or the
        message content of the first choice (OpenAI-style).
        """
        # isinstance (rather than an exact type comparison) so subclasses of
        # the Anthropic client also take the Claude path.
        if isinstance(self.client, anthropic.Anthropic):
            claude_messages, system_message = openai_messages_to_claude(input_messages)
            resp = self.client.messages.create(
                model=self.claude_model,
                messages=claude_messages,
                max_tokens=self.claude_max_tokens,
                system=system_message
            )
            return resp.content[0].text
        response = self.client.chat.completions.create(
            model=self.openai_model,
            messages=input_messages
        )
        return response.choices[0].message.content

    def state_change_prediction_in_website(self, screenshot, task, action_description, format="change"):
        """Predict the page state after one action on the screenshotted page.

        Args:
            screenshot: Image URL / data URI placed in the ``image_url`` part
                of the user message.
            task: Unused here; kept so all prediction methods share a
                signature.
            action_description: Natural-language description of the action.
            format: ``"change"``, ``"html"`` or ``"accessibility"``; selects
                the system prompt and therefore the shape of the prediction.

        Returns:
            The model's reply text.

        Raises:
            ValueError: If ``format`` is not one of the supported values.
        """
        system_prompt = self._system_prompt_for(self._WEBSITE_SYSTEM_PROMPTS, format)
        prompt_text = " Please predict the changes after {}.".format(action_description)
        input_messages = [
            {"role": "system", "content": system_prompt},
            self._screenshot_user_message(prompt_text, screenshot),
        ]
        return self._query_llm(input_messages)

    def action_proposal_in_imagination(self, screenshot, task, imaginations, format="change"):
        """Ask the model for the next action given the imagined history.

        With ``format == "change"`` the prompt contains the initial
        screenshot plus every imagined state change. With any other format
        the request is text-only and describes the page by the predicted
        state of the last imagination.

        Args:
            screenshot: Image URL / data URI of the initial page (only sent
                when ``format == "change"``).
            task: Natural-language task the agent has to complete.
            imaginations: List of ``[action, predicted_state]`` pairs. Must
                be non-empty when ``format`` is not ``"change"``.
            format: Representation used for the predicted states.

        Returns:
            The model's reply text describing a single next action.
        """
        is_change = format == "change"
        if is_change:
            prompt_text = "The above image is a screenshot of a web page. You are required to complete the following task: {}\n ".format(
                task)
        else:
            prompt_text = "You are required to complete the following task: {}\n ".format(task)
        prompt_text += "PREVIOUS ACTIONS: \n"
        # A proposed action can be None; str.format renders it as "None"
        # (matching the ACTION entries below) instead of raising the
        # TypeError that plain string concatenation would.
        prompt_text += "".join(
            "{} : {}\n".format(i + 1, item[0])
            for i, item in enumerate(imaginations)
        )
        prompt_text += "\n"

        system_message = {"role": "system", "content": self._AGENT_SYSTEM_PROMPT}
        if is_change:
            prompt_text += "The above image is the screenshot before actually performing all the prevoius actions. The current webpage should be a combination of the initial screenshot with the following changes."
            prompt_text += "The webpage has gone through several changes caused by previous actions:\n\n"
            prompt_text += self._render_history(imaginations)
            prompt_text += "Based on the initial screenshot and the changes to the webpage, please predict a single next step action to complete the given task. When proposing a new action, you must be faithful to the current state of the webpage (given by the initial screenshot and a series of state changes afterwards). This means the action proposed by you must be available given the current webpage state based on your evaluation. Please don't repeat any action from PREVIOUS ACTIONS. Please directly specify the operation in a short natural language description, including the operation type and the element to operate. Don't output anything else."
            input_messages = [
                system_message,
                self._screenshot_user_message(prompt_text, screenshot),
            ]
        else:
            # html / accessibility predictions describe the whole page, so
            # only the most recent predicted state is needed.
            last_step = imaginations[-1]
            prompt_text += "The current state of the webpage is as follows: \n{}\n".format(last_step[1])
            prompt_text += "Based on the current state of the webpage, please predict a single next step action to complete the given task. When proposing a new action, you must be faithful to the current state of the webpage. This means the action proposed by you should be available given the current webpage state based on your evaluation. Please don't repeat any action from PREVIOUS ACTIONS. Please directly specify the operation in a short natural language description, including the operation type and the element to operate. Don't output anything else."
            input_messages = [
                system_message,
                {"role": "user", "content": prompt_text},
            ]
        return self._query_llm(input_messages)

    def state_change_prediction_in_imagination(self, screenshot, task, imaginations, action, format="change"):
        """Predict the page state after ``action`` on top of imagined history.

        Args:
            screenshot: Image URL / data URI of the initial page.
            task: Unused here; kept so all prediction methods share a
                signature.
            imaginations: List of ``[action, predicted_state]`` pairs already
                imagined on top of the screenshot.
            action: Natural-language description of the action to evaluate.
            format: ``"change"``, ``"html"`` or ``"accessibility"``.

        Returns:
            The model's reply text.

        Raises:
            ValueError: If ``format`` is not one of the supported values.
        """
        system_prompt = self._system_prompt_for(self._IMAGINATION_SYSTEM_PROMPTS, format)
        # NOTE(review): the opening request runs straight into the first
        # ACTION entry with no separator and is repeated at the end; the
        # text is kept byte-for-byte so model inputs stay unchanged.
        prompt_text = " Please predict the changes after action: {}".format(action)
        prompt_text += self._render_history(imaginations)
        prompt_text += "Based on the initial screenshot and the changes to the webpage, please predict the changes after action: {}".format(
            action)
        input_messages = [
            {"role": "system", "content": system_prompt},
            self._screenshot_user_message(prompt_text, screenshot),
        ]
        return self._query_llm(input_messages)

    def multiple_step_change_prediction(self,
                                        screenshot,
                                        screenshot_path,  # this is for the sft model only
                                        task,
                                        action_description,
                                        format="change",
                                        k=0):
        """Roll the world model forward from one real action plus ``k`` imagined ones.

        First predicts the effect of ``action_description`` on the
        screenshotted page, then up to ``k`` times proposes a follow-up
        action and predicts its effect. The rollout ends early once a
        proposed action contains ``"stop"`` (case-insensitive); that action
        and its prediction are still included in the transcript.

        Args:
            screenshot: Image URL / data URI of the initial page.
            screenshot_path: Unused by this implementation.
            task: Natural-language task the agent has to complete.
            action_description: The first action to evaluate.
            format: ``"change"``, ``"html"`` or ``"accessibility"``.
            k: Maximum number of imagined follow-up steps.

        Returns:
            A text transcript of every proposed action and predicted page.
        """
        transcript = ["Proposed New Action: \n", action_description + '\n']

        # Initial state change, predicted against the real screenshot.
        change_description = self.state_change_prediction_in_website(
            screenshot, task, action_description, format)
        transcript += ["Predicted new webpage: \n", change_description + '\n']
        imagination_list = [[action_description, change_description]]

        # Follow-up steps happen purely in imagination.
        for step in range(k):
            transcript.append("==" * 25 + "STEP: " + str(step) + "==" * 25 + '\n')

            proposed_action = self.action_proposal_in_imagination(
                screenshot, task, imagination_list, format)
            transcript += ["Proposed New Action: \n", proposed_action + '\n']

            imagined_state_change = self.state_change_prediction_in_imagination(
                screenshot, task, imagination_list, proposed_action, format)
            transcript += ["Predicted new webpage: \n", imagined_state_change + '\n']

            if "stop" in proposed_action.lower():
                break
            imagination_list.append([proposed_action, imagined_state_change])
        return "".join(transcript)
if __name__ == '__main__':
    # Demo: imagine three follow-up steps for a shopping task using the
    # OpenAI client and accessibility-tree style predictions.
    import mimetypes

    world_model = WebWorldModel(OpenAI(api_key=os.environ["OPENAI_API_KEY"]))
    screenshot_path = "demo_data/shopping_0.png"
    # Label the data URI with the file's actual media type (image/png for
    # the demo screenshot) instead of always claiming JPEG; fall back to
    # JPEG only when the extension is unknown.
    media_type = mimetypes.guess_type(screenshot_path)[0] or "image/jpeg"
    screenshot = "data:{};base64,{}".format(media_type, encode_image(screenshot_path))
    action_description = "type 'red blanket' in the search bar and click search"
    task = "Buy the least expensive red blanket (in any size) from 'Blankets & Throws' category."
    imagination = world_model.multiple_step_change_prediction(screenshot, screenshot_path, task,
                                                              action_description,
                                                              format='accessibility', k=3)
    print(imagination)