remove screenshot when check user goal (#1147)
LawyZheng authored Nov 6, 2024
1 parent c084764 commit b62c2ca
Showing 3 changed files with 94 additions and 45 deletions.
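At a high level, the goal-verification step no longer re-scrapes the page with screenshots: `check_user_goal_complete` now refreshes the scraped page itself, with screenshots disabled, before building the element tree for the verification prompt. A minimal, self-contained sketch of that flow (the class and function below are simplified stand-ins, not the actual Skyvern code):

```python
# Toy sketch of the new flow: refresh the scraped page without screenshots
# before asking whether the user goal is complete. Names are illustrative.
import asyncio
from dataclasses import dataclass, field


@dataclass
class ToyScrapedPage:
    url: str
    screenshots: list[bytes] = field(default_factory=list)
    element_tree: list[dict] = field(default_factory=list)

    async def refresh(self, with_screenshot: bool = True) -> "ToyScrapedPage":
        # Re-scrape the same URL; skip the slow screenshot pass when it is not needed.
        shots = [b"<png bytes>"] if with_screenshot else []
        return ToyScrapedPage(url=self.url, screenshots=shots, element_tree=[{"tag": "button"}])


async def check_user_goal_complete(scraped_page: ToyScrapedPage) -> bool:
    # The verification prompt only needs the element tree, so screenshots are skipped.
    refreshed = await scraped_page.refresh(with_screenshot=False)
    return bool(refreshed.element_tree)  # stand-in for the LLM verdict


if __name__ == "__main__":
    print(asyncio.run(check_user_goal_complete(ToyScrapedPage(url="https://example.com"))))
```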
67 changes: 36 additions & 31 deletions skyvern/forge/agent.py
@@ -838,38 +838,26 @@ async def agent_step(

task_completes_on_download = task_block and task_block.complete_on_download and task.workflow_run_id
if not has_decisive_action and not task_completes_on_download:
LOG.info("Checking if user goal is achieved after re-scraping the page")
# Check if navigation goal is achieved after re-scraping the page
new_scraped_page = await self._scrape_with_type(
working_page = await browser_state.must_get_working_page()
complete_action = await self.check_user_goal_complete(
page=working_page,
scraped_page=scraped_page,
task=task,
step=step,
browser_state=browser_state,
scrape_type=ScrapeType.NORMAL,
organization=organization,
)
if new_scraped_page is None:
LOG.warning("Failed to scrape the page before checking user goal success, skipping check...")
else:
working_page = await browser_state.must_get_working_page()
complete_action = await self.check_user_goal_complete(
page=working_page,
scraped_page=new_scraped_page,
task=task,
step=step,
if complete_action is not None:
LOG.info("User goal achieved, executing complete action")
complete_action.organization_id = task.organization_id
complete_action.workflow_run_id = task.workflow_run_id
complete_action.task_id = task.task_id
complete_action.step_id = step.step_id
complete_action.step_order = step.order
complete_action.action_order = len(detailed_agent_step_output.actions_and_results)
complete_results = await ActionHandler.handle_action(
scraped_page, task, step, working_page, complete_action
)
if complete_action is not None:
LOG.info("User goal achieved, executing complete action")
complete_action.organization_id = task.organization_id
complete_action.workflow_run_id = task.workflow_run_id
complete_action.task_id = task.task_id
complete_action.step_id = step.step_id
complete_action.step_order = step.order
complete_action.action_order = len(detailed_agent_step_output.actions_and_results)
complete_results = await ActionHandler.handle_action(
scraped_page, task, step, working_page, complete_action
)
detailed_agent_step_output.actions_and_results.append((complete_action, complete_results))
await self.record_artifacts_after_action(task, step, browser_state)
detailed_agent_step_output.actions_and_results.append((complete_action, complete_results))
await self.record_artifacts_after_action(task, step, browser_state)
# If no action errors return the agent state and output
completed_step = await self.update_step(
step=step,
@@ -913,11 +901,19 @@ async def check_user_goal_complete(
page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> CompleteAction | None:
try:
+ LOG.info(
+ "Checking if user goal is achieved after re-scraping the page without screenshots",
+ task_id=task.task_id,
+ step_id=step.step_id,
+ workflow_run_id=task.workflow_run_id,
+ )
+ scraped_page_without_screenshots = await scraped_page.refresh(with_screenshot=False)

verification_prompt = prompt_engine.load_prompt(
"check-user-goal",
navigation_goal=task.navigation_goal,
navigation_payload=task.navigation_payload,
- elements=scraped_page.build_element_tree(ElementTreeFormat.HTML),
+ elements=scraped_page_without_screenshots.build_element_tree(ElementTreeFormat.HTML),
)

# this prompt is critical to our agent so let's use the primary LLM API handler
@@ -926,6 +922,9 @@ async def check_user_goal_complete(
LOG.error(
"Invalid LLM response for user goal success verification, skipping verification",
verification_response=verification_response,
+ task_id=task.task_id,
+ step_id=step.step_id,
+ workflow_run_id=task.workflow_run_id,
)
return None

@@ -940,7 +939,13 @@
)

except Exception:
LOG.error("LLM verification failed for complete action, skipping LLM verification", exc_info=True)
LOG.error(
"LLM verification failed for complete action, skipping LLM verification",
task_id=task.task_id,
step_id=step.step_id,
workflow_run_id=task.workflow_run_id,
exc_info=True,
)
return None

async def record_artifacts_after_action(self, task: Task, step: Step, browser_state: BrowserState) -> None:
@@ -1039,7 +1044,7 @@ async def _scrape_with_type(
browser_state: BrowserState,
scrape_type: ScrapeType,
organization: Organization | None = None,
- ) -> ScrapedPage | None:
+ ) -> ScrapedPage:
if scrape_type == ScrapeType.NORMAL:
pass

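Beyond the control-flow change, the log calls in `check_user_goal_complete` now carry the task, step, and workflow-run identifiers. With structlog, extra keyword arguments become fields on the log event, so calls of the following shape (identifier values are made up) produce lines that can be correlated across a run:

```python
import structlog

LOG = structlog.get_logger()

# Bind the identifiers on each call so every log line can be tied back to its task and step.
LOG.info(
    "Checking if user goal is achieved after re-scraping the page without screenshots",
    task_id="tsk_123",
    step_id="stp_456",
    workflow_run_id=None,
)
```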
4 changes: 2 additions & 2 deletions skyvern/forge/app.py
@@ -1,7 +1,6 @@
from typing import Awaitable, Callable

from fastapi import FastAPI
- from playwright.async_api import Frame, Page

from skyvern.forge.agent import ForgeAgent
from skyvern.forge.agent_functions import AgentFunction
@@ -17,6 +16,7 @@
from skyvern.forge.sdk.workflow.context_manager import WorkflowContextManager
from skyvern.forge.sdk.workflow.service import WorkflowService
from skyvern.webeye.browser_manager import BrowserManager
+ from skyvern.webeye.scraper.scraper import ScrapeExcludeFunc

SETTINGS_MANAGER = SettingsManager.get_settings()
DATABASE = AgentDB(
@@ -37,7 +37,7 @@
WORKFLOW_CONTEXT_MANAGER = WorkflowContextManager()
WORKFLOW_SERVICE = WorkflowService()
AGENT_FUNCTION = AgentFunction()
- scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None
+ scrape_exclude: ScrapeExcludeFunc | None = None
authentication_function: Callable[[str], Awaitable[Organization]] | None = None
setup_api_app: Callable[[FastAPI], None] | None = None

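The only functional change in this file is swapping the inline `Callable[[Page, Frame], Awaitable[bool]]` annotation for the `ScrapeExcludeFunc` alias now exported by `scraper.py`. A callback conforming to that alias might look like this (the frame-skipping rule is invented for illustration):

```python
from playwright.async_api import Frame, Page


# Matches ScrapeExcludeFunc = Callable[[Page, Frame], Awaitable[bool]]
async def skip_ad_frames(page: Page, frame: Frame) -> bool:
    # Hypothetical rule: exclude third-party ad iframes from element-tree scraping.
    return "ads" in frame.url
```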
68 changes: 56 additions & 12 deletions skyvern/webeye/scraper/scraper.py
@@ -3,11 +3,11 @@
import json
from collections import defaultdict
from enum import StrEnum
- from typing import Any, Awaitable, Callable
+ from typing import Any, Awaitable, Callable, Self

import structlog
from playwright.async_api import Frame, Locator, Page
- from pydantic import BaseModel
+ from pydantic import BaseModel, PrivateAttr

from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
@@ -18,6 +18,7 @@

LOG = structlog.get_logger()
CleanupElementTreeFunc = Callable[[Page | Frame, str, list[dict]], Awaitable[list[dict]]]
+ ScrapeExcludeFunc = Callable[[Page, Frame], Awaitable[bool]]

RESERVED_ATTRIBUTES = {
"accept", # for input file
Expand Down Expand Up @@ -211,6 +212,26 @@ class ScrapedPage(BaseModel):
html: str
extracted_text: str | None = None

+ _browser_state: BrowserState = PrivateAttr()
+ _clean_up_func: CleanupElementTreeFunc = PrivateAttr()
+ _scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None)
+
+ def __init__(self, **data: Any) -> None:
+ missing_attrs = [attr for attr in ["_browser_state", "_clean_up_func"] if attr not in data]
+ if len(missing_attrs) > 0:
+ raise ValueError(f"Missing required private attributes: {', '.join(missing_attrs)}")
+
+ # popup private attributes
+ browser_state = data.pop("_browser_state")
+ clean_up_func = data.pop("_clean_up_func")
+ scrape_exclude = data.pop("_scrape_exclude")
+
+ super().__init__(**data)
+
+ self._browser_state = browser_state
+ self._clean_up_func = clean_up_func
+ self._scrape_exclude = scrape_exclude
+
def build_element_tree(self, fmt: ElementTreeFormat = ElementTreeFormat.JSON) -> str:
if fmt == ElementTreeFormat.JSON:
return json.dumps(self.element_tree_trimmed)
@@ -220,13 +241,23 @@ def build_element_tree(self, fmt: ElementTreeFormat = ElementTreeFormat.JSON) -> str:

raise UnknownElementTreeFormat(fmt=fmt)

+ async def refresh(self, with_screenshot: bool = True) -> Self:
+ return await scrape_website(
+ browser_state=self._browser_state,
+ url=self.url,
+ cleanup_element_tree=self._clean_up_func,
+ scrape_exclude=self._scrape_exclude,
+ with_screenshot=with_screenshot,
+ )


async def scrape_website(
browser_state: BrowserState,
url: str,
cleanup_element_tree: CleanupElementTreeFunc,
num_retry: int = 0,
- scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
+ scrape_exclude: ScrapeExcludeFunc | None = None,
+ with_screenshot: bool = True,
) -> ScrapedPage:
"""
************************************************************************************************
@@ -251,7 +282,13 @@ async def scrape_website(
"""
try:
num_retry += 1
- return await scrape_web_unsafe(browser_state, url, cleanup_element_tree, scrape_exclude)
+ return await scrape_web_unsafe(
+ browser_state=browser_state,
+ url=url,
+ cleanup_element_tree=cleanup_element_tree,
+ scrape_exclude=scrape_exclude,
+ with_screenshot=with_screenshot,
+ )
except Exception as e:
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
@@ -272,6 +309,7 @@ async def scrape_website(
cleanup_element_tree,
num_retry=num_retry,
scrape_exclude=scrape_exclude,
+ with_screenshot=with_screenshot,
)


@@ -318,7 +356,8 @@ async def scrape_web_unsafe(
browser_state: BrowserState,
url: str,
cleanup_element_tree: CleanupElementTreeFunc,
- scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
+ scrape_exclude: ScrapeExcludeFunc | None = None,
+ with_screenshot: bool = True,
) -> ScrapedPage:
"""
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -331,10 +370,8 @@ async def scrape_web_unsafe(
:return: Tuple containing Page instance, base64 encoded screenshot, and page elements.
:note: This function does not handle exceptions. Ensure proper error handling in the calling context.
"""
- # We only create a new page if one does not exist. This is to allow keeping the same page since we want to
- # continue working on the same page that we're taking actions on.
- # *This also means URL is only used when creating a new page, and not when using an existing page.
- page = await browser_state.get_or_create_page(url)
+ # browser state must have the page instance, otherwise we should not do scraping
+ page = await browser_state.must_get_working_page()
# Take screenshots of the page with the bounding boxes. We will remove the bounding boxes later.
# Scroll to the top of the page and take a screenshot.
# Scroll to the next page and take a screenshot until we reach the end of the page.
@@ -345,7 +382,11 @@ async def scrape_web_unsafe(
LOG.info("Waiting for 5 seconds before scraping the website.")
await asyncio.sleep(5)

- screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)
+ screenshots: list[bytes] = []
+
+ # TODO: do we need to scroll to the button when we scrape without screenshots?
+ if with_screenshot:
+ screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)

elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
@@ -384,14 +425,17 @@ async def scrape_web_unsafe(
url=page.url,
html=html,
extracted_text=text_content,
+ _browser_state=browser_state,
+ _clean_up_func=cleanup_element_tree,
+ _scrape_exclude=scrape_exclude,
)


async def get_interactable_element_tree_in_frame(
frames: list[Frame],
elements: list[dict],
element_tree: list[dict],
- scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
+ scrape_exclude: ScrapeExcludeFunc | None = None,
) -> tuple[list[dict], list[dict]]:
for frame in frames:
if frame.is_detached():
@@ -445,7 +489,7 @@ async def get_interactable_element_tree_in_frame(

async def get_interactable_element_tree(
page: Page,
- scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
+ scrape_exclude: ScrapeExcludeFunc | None = None,
) -> tuple[list[dict], list[dict]]:
"""
Get the element tree of the page, including all the elements that are interactable.
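The `ScrapedPage` changes lean on pydantic private attributes threaded through a custom `__init__`, which is what lets a page re-scrape itself later via `refresh()`. A stripped-down version of that pattern (toy `Snapshot` model and hypothetical `_refresher` callable, not the Skyvern classes):

```python
from typing import Any, Callable

from pydantic import BaseModel, PrivateAttr


class Snapshot(BaseModel):
    url: str
    html: str

    # Private attributes are not part of the schema, so they are popped out of
    # the kwargs before calling BaseModel.__init__ and assigned afterwards.
    _refresher: Callable[[], str] = PrivateAttr()

    def __init__(self, **data: Any) -> None:
        if "_refresher" not in data:
            raise ValueError("Missing required private attribute: _refresher")
        refresher = data.pop("_refresher")
        super().__init__(**data)
        self._refresher = refresher

    def refresh(self) -> "Snapshot":
        # Rebuild the snapshot using the stored callable, keeping the same URL.
        return Snapshot(url=self.url, html=self._refresher(), _refresher=self._refresher)


snap = Snapshot(url="https://example.com", html="<html></html>", _refresher=lambda: "<html>fresh</html>")
print(snap.refresh().html)
```

Private attributes are not model fields, so they cannot be passed through the normal pydantic constructor; popping them out of `data` before calling `super().__init__()` and assigning them afterwards is the workaround the diff uses.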
