Skip to content

Commit

Permalink
add timeout for page.content() (#541)
Browse files Browse the repository at this point in the history
  • Loading branch information
LawyZheng authored Jul 3, 2024
1 parent 22566f0 commit b791cc1
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 4 deletions.
1 change: 1 addition & 0 deletions skyvern/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
REPO_ROOT_DIR = SKYVERN_DIR.parent

INPUT_TEXT_TIMEOUT = 120000 # 2 minutes
PAGE_CONTENT_TIMEOUT = 300 # 5 mins


class ScrapeType(StrEnum):
Expand Down
4 changes: 2 additions & 2 deletions skyvern/forge/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
from skyvern.webeye.actions.responses import ActionResult
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, get_page_content, scrape_website

LOG = structlog.get_logger()

Expand Down Expand Up @@ -786,7 +786,7 @@ async def record_artifacts_after_action(self, task: Task, step: Step, browser_st
)

try:
html = await browser_state.page.content()
html = await get_page_content(browser_state.page)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.HTML_ACTION,
Expand Down
19 changes: 17 additions & 2 deletions skyvern/webeye/scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from playwright.async_api import Frame, Page
from pydantic import BaseModel

from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.constants import PAGE_CONTENT_TIMEOUT, SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.browser_factory import BrowserState
Expand Down Expand Up @@ -289,6 +289,16 @@ async def scrape_web_unsafe(

text_content = await get_frame_text(page.main_frame)

html = ""
try:
html = await get_page_content(page)
except Exception:
LOG.error(
"Failed out to get HTML content",
url=url,
exc_info=True,
)

return ScrapedPage(
elements=elements,
id_to_xpath_dict=id_to_xpath_dict,
Expand All @@ -298,11 +308,16 @@ async def scrape_web_unsafe(
element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
screenshots=screenshots,
url=page.url,
html=await page.content(),
html=html,
extracted_text=text_content,
)


async def get_page_content(page: Page, timeout: float = PAGE_CONTENT_TIMEOUT) -> str:
    """Return the page's full HTML, raising TimeoutError if retrieval exceeds *timeout* seconds.

    Guards ``page.content()`` with a deadline so a hung renderer cannot block
    the caller indefinitely.
    """
    return await asyncio.wait_for(page.content(), timeout=timeout)


async def get_select2_options(page: Page) -> list[dict[str, Any]]:
await page.evaluate(JS_FUNCTION_DEFS)
js_script = "async () => await getSelect2Options()"
Expand Down

0 comments on commit b791cc1

Please sign in to comment.