From b62c2caae05a291835f8d75ae0445826e5fbcd81 Mon Sep 17 00:00:00 2001
From: LawyZheng
Date: Wed, 6 Nov 2024 23:20:45 +0800
Subject: [PATCH] remove screenshot when check user goal (#1147)

---
 skyvern/forge/agent.py            | 67 ++++++++++++++++--------------
 skyvern/forge/app.py              |  4 +-
 skyvern/webeye/scraper/scraper.py | 68 +++++++++++++++++++++++++------
 3 files changed, 94 insertions(+), 45 deletions(-)

diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py
index 709b37a32..b0060dc70 100644
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -838,38 +838,26 @@ async def agent_step(
 
             task_completes_on_download = task_block and task_block.complete_on_download and task.workflow_run_id
             if not has_decisive_action and not task_completes_on_download:
-                LOG.info("Checking if user goal is achieved after re-scraping the page")
-                # Check if navigation goal is achieved after re-scraping the page
-                new_scraped_page = await self._scrape_with_type(
+                working_page = await browser_state.must_get_working_page()
+                complete_action = await self.check_user_goal_complete(
+                    page=working_page,
+                    scraped_page=scraped_page,
                     task=task,
                     step=step,
-                    browser_state=browser_state,
-                    scrape_type=ScrapeType.NORMAL,
-                    organization=organization,
                 )
-                if new_scraped_page is None:
-                    LOG.warning("Failed to scrape the page before checking user goal success, skipping check...")
-                else:
-                    working_page = await browser_state.must_get_working_page()
-                    complete_action = await self.check_user_goal_complete(
-                        page=working_page,
-                        scraped_page=new_scraped_page,
-                        task=task,
-                        step=step,
+                if complete_action is not None:
+                    LOG.info("User goal achieved, executing complete action")
+                    complete_action.organization_id = task.organization_id
+                    complete_action.workflow_run_id = task.workflow_run_id
+                    complete_action.task_id = task.task_id
+                    complete_action.step_id = step.step_id
+                    complete_action.step_order = step.order
+                    complete_action.action_order = len(detailed_agent_step_output.actions_and_results)
+                    complete_results = await ActionHandler.handle_action(
+                        scraped_page, task, step, working_page, complete_action
                     )
-                    if complete_action is not None:
-                        LOG.info("User goal achieved, executing complete action")
-                        complete_action.organization_id = task.organization_id
-                        complete_action.workflow_run_id = task.workflow_run_id
-                        complete_action.task_id = task.task_id
-                        complete_action.step_id = step.step_id
-                        complete_action.step_order = step.order
-                        complete_action.action_order = len(detailed_agent_step_output.actions_and_results)
-                        complete_results = await ActionHandler.handle_action(
-                            scraped_page, task, step, working_page, complete_action
-                        )
-                        detailed_agent_step_output.actions_and_results.append((complete_action, complete_results))
-                        await self.record_artifacts_after_action(task, step, browser_state)
+                    detailed_agent_step_output.actions_and_results.append((complete_action, complete_results))
+                    await self.record_artifacts_after_action(task, step, browser_state)
             # If no action errors return the agent state and output
             completed_step = await self.update_step(
                 step=step,
@@ -913,11 +901,19 @@ async def check_user_goal_complete(
         page: Page, scraped_page: ScrapedPage, task: Task, step: Step
     ) -> CompleteAction | None:
         try:
+            LOG.info(
+                "Checking if user goal is achieved after re-scraping the page without screenshots",
+                task_id=task.task_id,
+                step_id=step.step_id,
+                workflow_run_id=task.workflow_run_id,
+            )
+            scraped_page_without_screenshots = await scraped_page.refresh(with_screenshot=False)
+
             verification_prompt = prompt_engine.load_prompt(
                 "check-user-goal",
                 navigation_goal=task.navigation_goal,
                 navigation_payload=task.navigation_payload,
-                elements=scraped_page.build_element_tree(ElementTreeFormat.HTML),
+                elements=scraped_page_without_screenshots.build_element_tree(ElementTreeFormat.HTML),
             )
 
             # this prompt is critical to our agent so let's use the primary LLM API handler
@@ -926,6 +922,9 @@
                 LOG.error(
                     "Invalid LLM response for user goal success verification, skipping verification",
                     verification_response=verification_response,
+                    task_id=task.task_id,
+                    step_id=step.step_id,
+                    workflow_run_id=task.workflow_run_id,
                 )
                 return None
 
@@ -940,7 +939,13 @@
             )
 
         except Exception:
-            LOG.error("LLM verification failed for complete action, skipping LLM verification", exc_info=True)
+            LOG.error(
+                "LLM verification failed for complete action, skipping LLM verification",
+                task_id=task.task_id,
+                step_id=step.step_id,
+                workflow_run_id=task.workflow_run_id,
+                exc_info=True,
+            )
             return None
 
     async def record_artifacts_after_action(self, task: Task, step: Step, browser_state: BrowserState) -> None:
@@ -1039,7 +1044,7 @@ async def _scrape_with_type(
         browser_state: BrowserState,
         scrape_type: ScrapeType,
         organization: Organization | None = None,
-    ) -> ScrapedPage | None:
+    ) -> ScrapedPage:
         if scrape_type == ScrapeType.NORMAL:
             pass
 
diff --git a/skyvern/forge/app.py b/skyvern/forge/app.py
index 3cc039333..630aa700d 100644
--- a/skyvern/forge/app.py
+++ b/skyvern/forge/app.py
@@ -1,7 +1,6 @@
 from typing import Awaitable, Callable
 
 from fastapi import FastAPI
-from playwright.async_api import Frame, Page
 
 from skyvern.forge.agent import ForgeAgent
 from skyvern.forge.agent_functions import AgentFunction
@@ -17,6 +16,7 @@
 from skyvern.forge.sdk.workflow.context_manager import WorkflowContextManager
 from skyvern.forge.sdk.workflow.service import WorkflowService
 from skyvern.webeye.browser_manager import BrowserManager
+from skyvern.webeye.scraper.scraper import ScrapeExcludeFunc
 
 SETTINGS_MANAGER = SettingsManager.get_settings()
 DATABASE = AgentDB(
@@ -37,7 +37,7 @@
 WORKFLOW_CONTEXT_MANAGER = WorkflowContextManager()
 WORKFLOW_SERVICE = WorkflowService()
 AGENT_FUNCTION = AgentFunction()
-scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None
+scrape_exclude: ScrapeExcludeFunc | None = None
 authentication_function: Callable[[str], Awaitable[Organization]] | None = None
 setup_api_app: Callable[[FastAPI], None] | None = None
 
diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py
index 814257446..91c633b4c 100644
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -3,11 +3,11 @@
 import json
 from collections import defaultdict
 from enum import StrEnum
-from typing import Any, Awaitable, Callable
+from typing import Any, Awaitable, Callable, Self
 
 import structlog
 from playwright.async_api import Frame, Locator, Page
-from pydantic import BaseModel
+from pydantic import BaseModel, PrivateAttr
 
 from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, SKYVERN_DIR, SKYVERN_ID_ATTR
 from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
@@ -18,6 +18,7 @@
 LOG = structlog.get_logger()
 
 CleanupElementTreeFunc = Callable[[Page | Frame, str, list[dict]], Awaitable[list[dict]]]
+ScrapeExcludeFunc = Callable[[Page, Frame], Awaitable[bool]]
 
 RESERVED_ATTRIBUTES = {
     "accept",  # for input file
@@ -211,6 +212,26 @@ class ScrapedPage(BaseModel):
     html: str
     extracted_text: str | None = None
 
+    _browser_state: BrowserState = PrivateAttr()
+    _clean_up_func: CleanupElementTreeFunc = PrivateAttr()
+    _scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None)
+
+    def __init__(self, **data: Any) -> None:
+        missing_attrs = [attr for attr in ["_browser_state", "_clean_up_func"] if attr not in data]
+        if len(missing_attrs) > 0:
+            raise ValueError(f"Missing required private attributes: {', '.join(missing_attrs)}")
+
+        # pop the private attributes out of the input data before pydantic validation
+        browser_state = data.pop("_browser_state")
+        clean_up_func = data.pop("_clean_up_func")
+        scrape_exclude = data.pop("_scrape_exclude", None)
+
+        super().__init__(**data)
+
+        self._browser_state = browser_state
+        self._clean_up_func = clean_up_func
+        self._scrape_exclude = scrape_exclude
+
     def build_element_tree(self, fmt: ElementTreeFormat = ElementTreeFormat.JSON) -> str:
         if fmt == ElementTreeFormat.JSON:
             return json.dumps(self.element_tree_trimmed)
@@ -220,13 +241,23 @@ def build_element_tree(self, fmt: ElementTreeFormat = ElementTreeFormat.JSON) ->
 
         raise UnknownElementTreeFormat(fmt=fmt)
 
+    async def refresh(self, with_screenshot: bool = True) -> Self:
+        return await scrape_website(
+            browser_state=self._browser_state,
+            url=self.url,
+            cleanup_element_tree=self._clean_up_func,
+            scrape_exclude=self._scrape_exclude,
+            with_screenshot=with_screenshot,
+        )
+
 
 async def scrape_website(
     browser_state: BrowserState,
     url: str,
     cleanup_element_tree: CleanupElementTreeFunc,
     num_retry: int = 0,
-    scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
+    scrape_exclude: ScrapeExcludeFunc | None = None,
+    with_screenshot: bool = True,
 ) -> ScrapedPage:
     """
     ************************************************************************************************
@@ -251,7 +282,13 @@
     """
     try:
         num_retry += 1
-        return await scrape_web_unsafe(browser_state, url, cleanup_element_tree, scrape_exclude)
+        return await scrape_web_unsafe(
+            browser_state=browser_state,
+            url=url,
+            cleanup_element_tree=cleanup_element_tree,
+            scrape_exclude=scrape_exclude,
+            with_screenshot=with_screenshot,
+        )
     except Exception as e:
         # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
         if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
@@ -272,6 +309,7 @@
             cleanup_element_tree,
             num_retry=num_retry,
             scrape_exclude=scrape_exclude,
+            with_screenshot=with_screenshot,
         )
 
 
@@ -318,7 +356,8 @@ async def scrape_web_unsafe(
     browser_state: BrowserState,
     url: str,
     cleanup_element_tree: CleanupElementTreeFunc,
-    scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
+    scrape_exclude: ScrapeExcludeFunc | None = None,
+    with_screenshot: bool = True,
 ) -> ScrapedPage:
     """
     Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -331,10 +370,8 @@
     :return: Tuple containing Page instance, base64 encoded screenshot, and page elements.
     :note: This function does not handle exceptions. Ensure proper error handling in the calling context.
     """
-    # We only create a new page if one does not exist. This is to allow keeping the same page since we want to
-    # continue working on the same page that we're taking actions on.
-    # *This also means URL is only used when creating a new page, and not when using an existing page.
-    page = await browser_state.get_or_create_page(url)
+    # browser state must have the page instance, otherwise we should not do scraping
+    page = await browser_state.must_get_working_page()
     # Take screenshots of the page with the bounding boxes. We will remove the bounding boxes later.
     # Scroll to the top of the page and take a screenshot.
     # Scroll to the next page and take a screenshot until we reach the end of the page.
@@ -345,7 +382,11 @@
 
     LOG.info("Waiting for 5 seconds before scraping the website.")
     await asyncio.sleep(5)
-    screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)
+    screenshots: list[bytes] = []
+
+    # TODO: do we need to scroll to the bottom when we scrape without screenshots?
+    if with_screenshot:
+        screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)
 
     elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
     element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
@@ -384,6 +425,9 @@
         url=page.url,
         html=html,
         extracted_text=text_content,
+        _browser_state=browser_state,
+        _clean_up_func=cleanup_element_tree,
+        _scrape_exclude=scrape_exclude,
     )
 
 
@@ -391,7 +435,7 @@ async def get_interactable_element_tree_in_frame(
     frames: list[Frame],
     elements: list[dict],
     element_tree: list[dict],
-    scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
+    scrape_exclude: ScrapeExcludeFunc | None = None,
 ) -> tuple[list[dict], list[dict]]:
     for frame in frames:
         if frame.is_detached():
@@ -445,7 +489,7 @@
 
 async def get_interactable_element_tree(
     page: Page,
-    scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
+    scrape_exclude: ScrapeExcludeFunc | None = None,
 ) -> tuple[list[dict], list[dict]]:
     """
     Get the element tree of the page, including all the elements that are interactable.
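
Note (not part of the patch): the sketch below is only an illustration of how the refresh() API introduced above is meant to be used. scrape_website, ScrapedPage.refresh, build_element_tree and ElementTreeFormat come from skyvern/webeye/scraper/scraper.py as modified by this diff; the helper name check_goal_elements_without_screenshots and its parameters are hypothetical caller-side context, not Skyvern API.

    # Minimal usage sketch, assuming the caller already holds the objects that
    # scrape_website() normally receives (a BrowserState, a cleanup callable,
    # and an optional ScrapeExcludeFunc).
    async def check_goal_elements_without_screenshots(
        browser_state, url, cleanup_element_tree, scrape_exclude=None
    ) -> str:
        # Initial scrape: screenshots are taken by default (with_screenshot=True),
        # and the returned ScrapedPage stores the browser state and the cleanup /
        # exclude callables as private attributes so it can re-scrape itself later.
        scraped_page = await scrape_website(
            browser_state=browser_state,
            url=url,
            cleanup_element_tree=cleanup_element_tree,
            scrape_exclude=scrape_exclude,
        )

        # Later, when only the element tree is needed (the user-goal check),
        # refresh the snapshot without paying the screenshot cost.
        page_without_screenshots = await scraped_page.refresh(with_screenshot=False)
        return page_without_screenshots.build_element_tree(ElementTreeFormat.HTML)

The PrivateAttr fields exist because pydantic would otherwise treat the browser state and the callables as model fields; popping them out of the constructor input before super().__init__() keeps them out of validation and serialization while still letting refresh() re-run scrape_website() with the same context.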