From 8c2a733ba2ca8152a2c67e50d8a2e4246e2cbf40 Mon Sep 17 00:00:00 2001
From: LawyZheng <lawyzheng1106@gmail.com>
Date: Sat, 14 Sep 2024 17:28:08 +0800
Subject: [PATCH] refactor custom-select/auto-complete context (#830)

---
 skyvern/exceptions.py                         |  8 +-
 .../forge/prompts/skyvern/custom-select.j2    |  2 +-
 .../forge/prompts/skyvern/extract-action.j2   |  2 -
 .../skyvern/parse-input-or-select-context.j2  | 21 +++++
 skyvern/webeye/actions/actions.py             | 24 ++----
 skyvern/webeye/actions/handler.py             | 82 ++++++++++++-------
 6 files changed, 86 insertions(+), 53 deletions(-)
 create mode 100644 skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2

diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py
index bb181103d..f6154602e 100644
--- a/skyvern/exceptions.py
+++ b/skyvern/exceptions.py
@@ -452,11 +452,9 @@ def __init__(self, element_id: str) -> None:
         )
 
 
-class NoLabelOrValueForCustomSelection(SkyvernException):
-    def __init__(self, element_id: str) -> None:
-        super().__init__(
-            f"This is a custom selection, there must be invalid text for option.label or option.value. element_id={element_id}"
-        )
+class NoAvailableOptionFoundForCustomSelection(SkyvernException):
+    def __init__(self, reason: str | None) -> None:
+        super().__init__(f"No available option to select. reason: {reason}.")
 
 
 class NoElementMatchedForTargetOption(SkyvernException):
diff --git a/skyvern/forge/prompts/skyvern/custom-select.j2 b/skyvern/forge/prompts/skyvern/custom-select.j2
index e097eae98..3aeea5478 100644
--- a/skyvern/forge/prompts/skyvern/custom-select.j2
+++ b/skyvern/forge/prompts/skyvern/custom-select.j2
@@ -1,4 +1,4 @@
-You are performing a selection action on an HTML page. Assist the user in selecting the most appropriate option to advance toward their goal, considering the context, user details, and the DOM elements provided in the list.
+You are performing a {{ "multi-level selection" if select_history else "selection" }} action on an HTML page. Assist the user in selecting the most appropriate option to advance toward their goal, considering the context, user details, and the DOM elements provided in the list.
 
 You can identify the matching element based on the following guidelines:
   1. Select the most suitable element based on the user goal, user details, and the context.
diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2
index 76c29f6b6..12c7af030 100644
--- a/skyvern/forge/prompts/skyvern/extract-action.j2
+++ b/skyvern/forge/prompts/skyvern/extract-action.j2
@@ -17,8 +17,6 @@ Reply in JSON format with the following keys:
         "reasoning": str, // The reasoning behind the action. Be specific, referencing any user information and their fields and element ids in your reasoning. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point.
         "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
         "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
-        "field_information": str, // The target field for the action. Only for INPUT_TEXT and SELECT_OPTION actions. Otherwise it should be null.
-        "required_field": bool, // True if it's a required field, otherwise false.
         "id": str, // The id of the element to take action on. The id has to be one from the elements list
         "text": str, // Text for INPUT_TEXT action only
         "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise.
diff --git a/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 b/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2
new file mode 100644
index 000000000..ff25f32ed
--- /dev/null
+++ b/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2
@@ -0,0 +1,21 @@
+You are a browser agent performing actions on the web. You are instructed to take an INPUT or SELECT action on the element(id: "{{ element_id }}"). Extract some detailed information from the context/reasoning, and double-check the information by analysing the HTML elements.
+
+MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
+
+Reply in the following JSON format:
+{
+    "thought": str, // A string to describe how you double-check the information to ensure the accuracy.
+    "field": str, // Which field is this action intended to fill out?
+    "is_required": bool, // True if this is a required field, otherwise false.
+}
+
+Existing reasoning context:
+```
+{{ action_reasoning }}
+```
+
+HTML elements:
+```
+{{ elements }}
+```
+
diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py
index 12411c206..3cf5f3104 100644
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -43,10 +43,16 @@ def __repr__(self) -> str:
         return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
 
 
+class InputOrSelectContext(BaseModel):
+    field: str | None = None
+    is_required: bool | None = None
+
+    def __repr__(self) -> str:
+        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required})"
+
+
 class Action(BaseModel):
     action_type: ActionType
-    field_information: str | None = None
-    required_field: bool | None = None
     confidence_float: float | None = None
     description: str | None = None
     reasoning: str | None = None
@@ -162,8 +168,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
 
     reasoning = action["reasoning"] if "reasoning" in action else None
     confidence_float = action["confidence_float"] if "confidence_float" in action else None
-    field_information = action["field_information"] if "field_information" in action else None
-    required_field = action["required_field"] if "required_field" in action else None
 
     if "action_type" not in action or action["action_type"] is None:
         return NullAction(reasoning=reasoning, confidence_float=confidence_float)
@@ -181,8 +185,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
     if action_type == ActionType.CLICK:
         file_url = action["file_url"] if "file_url" in action else None
         return ClickAction(
-            field_information=field_information,
-            required_field=required_field,
             element_id=element_id,
             reasoning=reasoning,
             confidence_float=confidence_float,
@@ -192,8 +194,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
 
     if action_type == ActionType.INPUT_TEXT:
         return InputTextAction(
-            field_information=field_information,
-            required_field=required_field,
             element_id=element_id,
             text=action["text"],
             reasoning=reasoning,
@@ -203,8 +203,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
     if action_type == ActionType.UPLOAD_FILE:
         # TODO: see if the element is a file input element. if it's not, convert this action into a click action
         return UploadFileAction(
-            field_information=field_information,
-            required_field=required_field,
             element_id=element_id,
             confidence_float=confidence_float,
             file_url=action["file_url"],
@@ -214,8 +212,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
     # This action is not used in the current implementation. Click actions are used instead.
     if action_type == ActionType.DOWNLOAD_FILE:
         return DownloadFileAction(
-            field_information=field_information,
-            required_field=required_field,
             element_id=element_id,
             file_name=action["file_name"],
             reasoning=reasoning,
@@ -232,8 +228,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
         if label is None and value is None and index is None:
             raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
         return SelectOptionAction(
-            field_information=field_information,
-            required_field=required_field,
             element_id=element_id,
             option=SelectOption(
                 label=label,
@@ -246,8 +240,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
 
     if action_type == ActionType.CHECKBOX:
         return CheckboxAction(
-            field_information=field_information,
-            required_field=required_field,
             element_id=element_id,
             is_checked=action["is_checked"],
             reasoning=reasoning,
diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py
index 7eafeada3..1e2977627 100644
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -29,10 +29,10 @@
     MissingFileUrl,
     MultipleElementsFound,
     NoAutoCompleteOptionMeetCondition,
+    NoAvailableOptionFoundForCustomSelection,
     NoElementMatchedForTargetOption,
     NoIncrementalElementFoundForAutoCompletion,
     NoIncrementalElementFoundForCustomSelection,
-    NoLabelOrValueForCustomSelection,
     NoSuitableAutoCompleteOption,
     OptionIndexOutOfBound,
     WrongElementToUploadFile,
@@ -58,6 +58,7 @@
     ActionType,
     CheckboxAction,
     ClickAction,
+    InputOrSelectContext,
     ScrapeResult,
     SelectOption,
     SelectOptionAction,
@@ -418,8 +419,6 @@ async def handle_input_text_action(
     if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_spinbtn_input():
         await skyvern_element.scroll_into_view()
         select_action = SelectOptionAction(
-            field_information=action.field_information,
-            required_field=action.required_field,
             reasoning=action.reasoning,
             element_id=skyvern_element.get_id(),
             option=SelectOption(label=text),
@@ -464,6 +463,7 @@ async def handle_input_text_action(
                     llm_handler=app.SECONDARY_LLM_API_HANDLER,
                     step=step,
                     task=task,
+                    target_value=text,
                 )
                 if result is not None:
                     return [result]
@@ -692,8 +692,6 @@ async def handle_select_option_action(
             )
             select_action = SelectOptionAction(
                 reasoning=action.reasoning,
-                field_information=action.field_information,
-                required_field=action.required_field,
                 element_id=selectable_child.get_id(),
                 option=action.option,
             )
@@ -1069,7 +1067,7 @@ async def fc_func(fc: FileChooser) -> None:
 
 
 async def choose_auto_completion_dropdown(
-    action: actions.InputTextAction,
+    context: InputOrSelectContext,
     page: Page,
     dom: DomUtil,
     text: str,
@@ -1133,7 +1131,7 @@ async def choose_auto_completion_dropdown(
         html = incremental_scraped.build_html_tree(incremental_element)
         auto_completion_confirm_prompt = prompt_engine.load_prompt(
             "auto-completion-choose-option",
-            field_information=action.field_information,
+            field_information=context.field,
             filled_value=text,
             navigation_goal=task.navigation_goal,
             navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1211,6 +1209,22 @@ async def input_or_auto_complete_input(
         element_id=skyvern_element.get_id(),
     )
 
+    prompt = prompt_engine.load_prompt(
+        "parse-input-or-select-context",
+        element_id=action.element_id,
+        action_reasoning=action.reasoning,
+        elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
+    )
+
+    json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
+    input_or_select_context = InputOrSelectContext.model_validate(json_response)
+    LOG.info(
+        "Parsed input/select context",
+        context=input_or_select_context,
+        task_id=task.task_id,
+        step_id=step.step_id,
+    )
+
     # 1. press the orignal text to see if there's a match
     # 2. call LLM to find 5 potential values based on the orginal text
     # 3. try each potential values from #2
@@ -1219,7 +1233,6 @@ async def input_or_auto_complete_input(
     # FIXME: try the whole loop for twice now, to prevent too many LLM calls
     MAX_AUTO_COMPLETE_ATTEMP = 2
     current_attemp = 0
-    context_reasoning = action.reasoning
     current_value = text
     result = AutoCompletionResult()
 
@@ -1235,7 +1248,7 @@ async def input_or_auto_complete_input(
             input_value=current_value,
         )
         result = await choose_auto_completion_dropdown(
-            action=action,
+            context=input_or_select_context,
             page=page,
             dom=dom,
             text=current_value,
@@ -1252,7 +1265,7 @@ async def input_or_auto_complete_input(
 
         prompt = prompt_engine.load_prompt(
             "auto-completion-potential-answers",
-            field_information=action.field_information,
+            field_information=input_or_select_context.field,
             current_value=current_value,
         )
 
@@ -1282,7 +1295,7 @@ async def input_or_auto_complete_input(
                 input_value=value,
             )
             result = await choose_auto_completion_dropdown(
-                action=action,
+                context=input_or_select_context,
                 page=page,
                 dom=dom,
                 text=value,
@@ -1307,7 +1320,7 @@ async def input_or_auto_complete_input(
             )
             prompt = prompt_engine.load_prompt(
                 "auto-completion-tweak-value",
-                field_information=action.field_information,
+                field_information=input_or_select_context.field,
                 current_value=current_value,
                 tried_values=json.dumps(tried_values),
                 popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
@@ -1321,7 +1334,7 @@ async def input_or_auto_complete_input(
                 "Ask LLM tweaked the current value with a new value",
                 step_id=step.step_id,
                 task_id=task.task_id,
-                field_information=action.field_information,
+                field_information=input_or_select_context.field,
                 current_value=current_value,
                 new_value=new_current_value,
             )
@@ -1341,13 +1354,28 @@ async def sequentially_select_from_dropdown(
     step: Step,
     task: Task,
     force_select: bool = False,
-    should_relevant: bool = True,
+    target_value: str = "",
 ) -> tuple[ActionResult | None, str | None]:
     """
     TODO: support to return all values retrieved from the sequentially select
     Only return the last value today
     """
 
+    prompt = prompt_engine.load_prompt(
+        "parse-input-or-select-context",
+        action_reasoning=action.reasoning,
+        element_id=action.element_id,
+        elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
+    )
+    json_response = await llm_handler(prompt=prompt, step=step)
+    input_or_select_context = InputOrSelectContext.model_validate(json_response)
+    LOG.info(
+        "Parsed input/select context",
+        context=input_or_select_context,
+        task_id=task.task_id,
+        step_id=step.step_id,
+    )
+
     # TODO: only suport the third-level dropdown selection now
     MAX_SELECT_DEPTH = 3
     values: list[str | None] = []
@@ -1356,7 +1384,7 @@ async def sequentially_select_from_dropdown(
     check_exist_funcs: list[CheckExistIDFunc] = [dom.check_id_in_dom]
     for i in range(MAX_SELECT_DEPTH):
         single_select_result = await select_from_dropdown(
-            action=action,
+            context=input_or_select_context,
             page=page,
             skyvern_frame=skyvern_frame,
             incremental_scraped=incremental_scraped,
@@ -1366,7 +1394,7 @@ async def sequentially_select_from_dropdown(
             task=task,
             select_history=select_history,
             force_select=force_select,
-            should_relevant=should_relevant,
+            target_value=target_value,
         )
         select_history.append(single_select_result)
         values.append(single_select_result.value)
@@ -1431,7 +1459,7 @@ def build_sequential_select_history(history_list: list[CustomSingleSelectResult]
 
 
 async def select_from_dropdown(
-    action: SelectOptionAction,
+    context: InputOrSelectContext,
     page: Page,
     skyvern_frame: SkyvernFrame,
     incremental_scraped: IncrementalScrapePage,
@@ -1441,11 +1469,11 @@ async def select_from_dropdown(
     task: Task,
     select_history: list[CustomSingleSelectResult] | None = None,
     force_select: bool = False,
-    should_relevant: bool = True,
+    target_value: str = "",
 ) -> CustomSingleSelectResult:
     """
     force_select: is used to choose an element to click even there's no dropdown menu;
-    should_relevant: only valid when force_select is "False". When "True", the chosen value must be relevant to the target value;
+    target_value: only valid when force_select is "False". When target_value is not empty, the matched option must be relevant to the target value;
     None will be only returned when:
         1. force_select is false and no dropdown menu popped
         2. force_select is false and match value is not relevant to the target value
@@ -1490,15 +1518,11 @@ async def select_from_dropdown(
 
     html = incremental_scraped.build_html_tree(element_tree=trimmed_element_tree)
 
-    target_value = action.option.label or action.option.value
-    if target_value is None:
-        raise NoLabelOrValueForCustomSelection(element_id=action.element_id)
-
     prompt = prompt_engine.load_prompt(
         "custom-select",
-        field_information=action.field_information,
-        required_field=action.required_field,
-        target_value=target_value if not force_select and should_relevant else "",
+        field_information=context.field,
+        required_field=context.is_required,
+        target_value="" if force_select else target_value,
         navigation_goal=task.navigation_goal,
         navigation_payload_str=json.dumps(task.navigation_payload),
         elements=html,
@@ -1526,11 +1550,11 @@ async def select_from_dropdown(
 
     element_id: str | None = json_response.get("id", None)
     if not element_id:
-        raise NoElementMatchedForTargetOption(target=target_value, reason=json_response.get("reasoning"))
+        raise NoAvailableOptionFoundForCustomSelection(reason=json_response.get("reasoning"))
 
-    if not force_select and should_relevant:
+    if not force_select and target_value:
         if not json_response.get("relevant", False):
-            LOG.debug(
+            LOG.info(
                 "The selected option is not relevant to the target value",
                 element_id=element_id,
                 task_id=task.task_id,