diff --git a/.env.example b/.env.example index 426651a29..84381f848 100644 --- a/.env.example +++ b/.env.example @@ -33,6 +33,15 @@ AZURE_GPT4O_MINI_API_KEY="" AZURE_GPT4O_MINI_API_BASE="" AZURE_GPT4O_MINI_API_VERSION="" +# ENABLE_LLAMA: Set to true to enable Llama as a language model provider +ENABLE_LLAMA=false +# LLAMA_API_BASE: The base URL for Llama API (default: http://localhost:11434) +LLAMA_API_BASE="" +# LLAMA_MODEL_NAME: The model name to use (e.g., llama3.2-vision) +LLAMA_MODEL_NAME="" +# LLAMA_API_ROUTE: The API route for Llama (default: /api/chat) +LLAMA_API_ROUTE="" + # LLM_KEY: The chosen language model to use. This should be one of the models # provided by the enabled LLM providers (e.g., OPENAI_GPT4_TURBO, OPENAI_GPT4V, ANTHROPIC_CLAUDE3, AZURE_OPENAI_GPT4V). LLM_KEY="" diff --git a/Dockerfile b/Dockerfile index 1364616ba..c5b24c8e6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,15 +14,21 @@ RUN playwright install-deps RUN playwright install RUN apt-get install -y xauth x11-apps netpbm && apt-get clean +# Add these lines to install dos2unix and convert entrypoint scripts +RUN apt-get update && \ + apt-get install -y dos2unix && \ + apt-get clean + COPY . /app +# Convert line endings +RUN dos2unix /app/entrypoint-skyvern.sh && \ + chmod +x /app/entrypoint-skyvern.sh + ENV PYTHONPATH="/app:$PYTHONPATH" ENV VIDEO_PATH=/data/videos ENV HAR_PATH=/data/har ENV LOG_PATH=/data/log ENV ARTIFACT_STORAGE_PATH=/data/artifacts -COPY ./entrypoint-skyvern.sh /app/entrypoint-skyvern.sh -RUN chmod +x /app/entrypoint-skyvern.sh - CMD [ "/bin/bash", "/app/entrypoint-skyvern.sh" ] diff --git a/docker-compose.yml b/docker-compose.yml index 83844785d..03c1a666b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,9 +21,12 @@ services: retries: 5 skyvern: - image: public.ecr.aws/skyvern/skyvern:latest + # Replace the public image with a local build + build: + context: . + dockerfile: Dockerfile + # Keep the rest of the configuration restart: on-failure - # comment out if you want to externally call skyvern API ports: - 8000:8000 volumes: @@ -35,18 +38,20 @@ services: environment: - DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern - BROWSER_TYPE=chromium-headful - - ENABLE_OPENAI=true - - OPENAI_API_KEY= - # If you want to use other LLM provider, like azure and anthropic: - # - ENABLE_ANTHROPIC=true - # - LLM_KEY=ANTHROPIC_CLAUDE3_OPUS - # - ANTHROPIC_API_KEY= - # - ENABLE_AZURE=true - # - LLM_KEY=AZURE_OPENAI - # - AZURE_DEPLOYMENT= - # - AZURE_API_KEY= - # - AZURE_API_BASE= - # - AZURE_API_VERSION= + - ENABLE_LLAMA=true + - LLM_KEY=LLAMA3 + - LLAMA_API_BASE=http://192.168.1.65:11434 + - LLAMA_MODEL_NAME=llama3.2-vision + - LLAMA_API_ROUTE=/api/chat + - ENABLE_OPENAI=false + - ENABLE_ANTHROPIC=false + - ENABLE_AZURE=false + - ENABLE_BEDROCK=false + - ENABLE_AZURE_GPT4O_MINI=false + - LLAMA_BASE_URL=http://192.168.1.65:11434 + - LLAMA_MODEL=llama3.2-vision + - ENV=local + - SECONDARY_LLM_KEY=LLAMA3 depends_on: postgres: condition: service_healthy @@ -55,6 +60,8 @@ services: interval: 5s timeout: 5s retries: 5 + extra_hosts: + - "host.docker.internal:host-gateway" skyvern-ui: image: public.ecr.aws/skyvern/skyvern-ui:latest diff --git a/setup.sh b/setup.sh index 8b7a145ec..c4ef77331 100755 --- a/setup.sh +++ b/setup.sh @@ -9,7 +9,7 @@ log_event() { # Function to check if a command exists command_exists() { - command -v "$1" &> /dev/null + command -v "$1" &>/dev/null } ensure_required_commands() { @@ -31,7 +31,7 @@ update_or_add_env_var() { sed -i.bak "s/^$key=.*/$key=$value/" .env && rm -f .env.bak else # Add new variable - echo "$key=$value" >> .env + echo "$key=$value" >>.env fi } @@ -98,16 +98,25 @@ setup_llm_providers() { update_or_add_env_var "ENABLE_AZURE" "false" fi + echo "Do you want to enable Llama (y/n)?" + read enable_llama + if [[ "$enable_llama" == "y" ]]; then + read -p "Enter path to Llama model: " llama_model_path + update_or_add_env_var "ENABLE_LLAMA" "true" + update_or_add_env_var "LLAMA_MODEL_PATH" "$llama_model_path" + model_options+=("LLAMA_3_2_VISION") + fi + # Model Selection if [ ${#model_options[@]} -eq 0 ]; then echo "No LLM providers enabled. You won't be able to run Skyvern unless you enable at least one provider. You can re-run this script to enable providers or manually update the .env file." else echo "Available LLM models based on your selections:" for i in "${!model_options[@]}"; do - echo "$((i+1)). ${model_options[$i]}" + echo "$((i + 1)). ${model_options[$i]}" done read -p "Choose a model by number (e.g., 1 for ${model_options[0]}): " model_choice - chosen_model=${model_options[$((model_choice-1))]} + chosen_model=${model_options[$((model_choice - 1))]} echo "Chosen LLM Model: $chosen_model" update_or_add_env_var "LLM_KEY" "$chosen_model" fi @@ -115,7 +124,6 @@ setup_llm_providers() { echo "LLM provider configurations updated in .env." } - # Function to initialize .env file initialize_env_file() { if [ -f ".env" ]; then @@ -165,14 +173,16 @@ remove_poetry_env() { # Choose python version choose_python_version_or_fail() { - # https://github.com/python-poetry/poetry/issues/2117 - # Py --list-paths + # https://github.com/python-poetry/poetry/issues/2117 + # Py --list-paths # This will output which paths are being used for Python 3.11 - # Windows users need to poetry env use {{ Py --list-paths with 3.11}} - poetry env use python3.11 || { echo "Error: Python 3.11 is not installed. If you're on Windows, check out https://github.com/python-poetry/poetry/issues/2117 to unblock yourself"; exit 1; } + # Windows users need to poetry env use {{ Py --list-paths with 3.11}} + poetry env use python3.11 || { + echo "Error: Python 3.11 is not installed. If you're on Windows, check out https://github.com/python-poetry/poetry/issues/2117 to unblock yourself" + exit 1 + } } - # Function to install dependencies install_dependencies() { poetry install @@ -211,9 +221,9 @@ setup_postgresql() { return 0 fi fi - + # Check if Docker is installed and running - if ! command_exists docker || ! docker info > /dev/null 2>&1; then + if ! command_exists docker || ! docker info >/dev/null 2>&1; then echo "Docker is not running or not installed. Please install or start Docker and try again." exit 1 fi @@ -221,7 +231,7 @@ setup_postgresql() { # Check if PostgreSQL is already running in a Docker container if docker ps | grep -q postgresql-container; then echo "PostgreSQL is already running in a Docker container." - else + else # Attempt to install and start PostgreSQL using Docker echo "Attempting to install PostgreSQL via Docker..." docker run --name postgresql-container -e POSTGRES_HOST_AUTH_METHOD=trust -d -p 5432:5432 postgres:14 @@ -229,7 +239,7 @@ setup_postgresql() { # Wait for PostgreSQL to start echo "Waiting for PostgreSQL to start..." - sleep 20 # Adjust sleep time as necessary + sleep 20 # Adjust sleep time as necessary fi # Assuming docker exec works directly since we've checked Docker's status before @@ -272,7 +282,7 @@ create_organization() { fi # Update the secrets-open-source.toml file - echo -e "[skyvern]\nconfigs = [\n {\"env\" = \"local\", \"host\" = \"http://127.0.0.1:8000/api/v1\", \"orgs\" = [{name=\"Skyvern\", cred=\"$api_token\"}]}\n]" > .streamlit/secrets.toml + echo -e "[skyvern]\nconfigs = [\n {\"env\" = \"local\", \"host\" = \"http://127.0.0.1:8000/api/v1\", \"orgs\" = [{name=\"Skyvern\", cred=\"$api_token\"}]}\n]" >.streamlit/secrets.toml echo ".streamlit/secrets.toml file updated with organization details." # Check if skyvern-frontend/.env exists and back it up diff --git a/skyvern/__init__.py b/skyvern/__init__.py index 502cde743..c6df0049c 100644 --- a/skyvern/__init__.py +++ b/skyvern/__init__.py @@ -2,6 +2,8 @@ from ddtrace.filters import FilterRequestsOnUrl from skyvern.forge.sdk.forge_log import setup_logger +from typing import Any, List +from skyvern.forge.sdk.models import Step tracer.configure( settings={ @@ -11,3 +13,12 @@ }, ) setup_logger() + +async def llama_handler( + prompt: str, + step: Step | None = None, + screenshots: list[bytes] | None = None, + parameters: dict[str, Any] | None = None, +) -> dict[str, Any]: + # Implement Llama 3.2 vision API integration here + ... diff --git a/skyvern/config.py b/skyvern/config.py index aed4b70ba..54d2e0e77 100644 --- a/skyvern/config.py +++ b/skyvern/config.py @@ -5,7 +5,26 @@ class Settings(BaseSettings): - model_config = SettingsConfigDict(env_file=(".env", ".env.staging", ".env.prod"), extra="ignore") + # Use only model_config, not Config class + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + extra="ignore" + ) + + # Llama Configuration + ENABLE_LLAMA: bool = True + LLAMA_API_BASE: str = "http://192.168.1.65:11434" + LLAMA_MODEL_NAME: str = "llama3.2-vision" + LLAMA_API_ROUTE: str = "/api/chat" + LLM_KEY: str = "LLAMA3" + SECONDARY_LLM_KEY: str = "LLAMA3" + + # Disable other providers + ENABLE_OPENAI: bool = False + ENABLE_ANTHROPIC: bool = False + ENABLE_AZURE: bool = False + ENABLE_BEDROCK: bool = False ADDITIONAL_MODULES: list[str] = [] @@ -18,6 +37,14 @@ class Settings(BaseSettings): BROWSER_SCREENSHOT_TIMEOUT_MS: int = 20000 BROWSER_LOADING_TIMEOUT_MS: int = 120000 OPTION_LOADING_TIMEOUT_MS: int = 600000 + MAX_SCRAPING_RETRIES: int = 0 + VIDEO_PATH: str | None = None + HAR_PATH: str | None = "./har" + LOG_PATH: str = "./log" + BROWSER_ACTION_TIMEOUT_MS: int = 5000 + BROWSER_SCREENSHOT_TIMEOUT_MS: int = 20000 + BROWSER_LOADING_TIMEOUT_MS: int = 120000 + OPTION_LOADING_TIMEOUT_MS: int = 600000 MAX_STEPS_PER_RUN: int = 75 MAX_NUM_SCREENSHOTS: int = 10 # Ratio should be between 0 and 1. @@ -91,8 +118,8 @@ class Settings(BaseSettings): # LLM Configuration # ##################### # ACTIVE LLM PROVIDER - LLM_KEY: str = "OPENAI_GPT4O" - SECONDARY_LLM_KEY: str | None = None + LLM_KEY: str = "LLAMA3" # Change default from OPENAI_GPT4O + SECONDARY_LLM_KEY: str = "LLAMA3" # Also set this to LLAMA3 # COMMON LLM_CONFIG_TIMEOUT: int = 300 LLM_CONFIG_MAX_TOKENS: int = 4096 @@ -126,6 +153,9 @@ class Settings(BaseSettings): SVG_MAX_LENGTH: int = 100000 + # Add debug property + DEBUG: bool = True + def is_cloud_environment(self) -> bool: """ :return: True if env is not local, else False diff --git a/skyvern/forge/prompts.py b/skyvern/forge/prompts.py index ce836d93e..cb7d42a9d 100644 --- a/skyvern/forge/prompts.py +++ b/skyvern/forge/prompts.py @@ -1,4 +1,5 @@ from skyvern.forge.sdk.prompting import PromptEngine # Initialize the prompt engine -prompt_engine = PromptEngine("skyvern") +prompt_engine = PromptEngine("ollama") +prompt_engine_llama = PromptEngine("ollama") diff --git a/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 b/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 new file mode 100644 index 000000000..2d99c81c9 --- /dev/null +++ b/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 @@ -0,0 +1,45 @@ +You are a JSON API endpoint that answers questions based on user details and goals. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Answer user questions based on provided information +- Use exact information from user details +- Keep answers direct and concise +- Fill in answers as JSON key-value pairs + +Input data: +User's goal: {{ navigation_goal }} +User's details: {{ navigation_payload }} +User's questions: {{ queries_and_answers }} + +Instructions for answering: +1. Read each question carefully +2. Find relevant information in user's goal and details +3. Provide only the exact information needed +4. Include answers in the JSON response +5. Keep answers direct - no explanations +6. Use precise values from provided details + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. NO additional formatting or whitespace +6. Response must be pure JSON only + +Response format (replace with actual answers): +{ + "question_1": "", + "question_2": "", + "question_3": "" +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure + +These answers will be used to fill out information on a webpage automatically. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 b/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 new file mode 100644 index 000000000..86d595e14 --- /dev/null +++ b/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 @@ -0,0 +1,61 @@ +You are a JSON API endpoint for auto-completion analysis. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Analyze auto-completion attempts for input fields +- Evaluate suggested options against user goals +- Select the most appropriate option +- Return analysis in strict JSON format + +Auto-completion Detection Rules: +1. Count as attempt if: + - Multiple suggestions appear + - Even "No results" messages indicate an attempt +2. Valid suggestions must: + - Have an ID from provided HTML elements + - Contain meaningful content (not just "No results") + - Match user goals and context + +Analysis Requirements: +1. Check for auto-completion presence +2. Evaluate suggestion relevance +3. Consider user goals and details +4. Select best matching element +5. Provide confidence ratings +6. Use only existing element IDs + +Input Data: +Context: Choose an auto-completion suggestion for "{{ field_information }}" +Input value: {{ filled_value }} +User goal: {{ navigation_goal }} +User details: {{ navigation_payload_str }} +HTML elements: {{ elements }} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: +{ + "auto_completion_attempt": false, // true if attempt detected + "reasoning": "", // brief reason for decision + "confidence_float": 0.0, // 0.0 to 1.0 + "relevance_float": 0.0, // 0.00 to 1.00 + "value": "", // selected value + "id": null // element ID or null +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types + +This response will be used for automated webpage interaction. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 b/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 new file mode 100644 index 000000000..f51af90a6 --- /dev/null +++ b/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 @@ -0,0 +1,57 @@ +You are a JSON API endpoint for generating alternative input values. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Generate 10 alternative values for failed auto-completion +- Maintain same core meaning as original value +- Provide variations that might match system expectations +- Return strictly formatted JSON array of options + +Value Generation Rules: +1. Create variations by: + - Using subset of original value + - Using superset of original value + - Summarizing original value + - Removing unnecessary details +2. Each variation must: + - Keep core meaning intact + - Not add new information + - Be more concise when possible +3. Order by relevance (highest to lowest) + +Input Data: +Context: Choose an auto-completion suggestion for "{{ field_information }}" +Current Value: {{ current_value }} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Exactly 10 variations required + +Required Response Format: +{ + "potential_values": [ + { + "reasoning": "", // brief explanation of relationship to original + "relevance_float": 0.00, // 0.00 to 1.00, two decimal places + "value": "" // alternative value + } + // Repeat for total of 10 values + ] +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Fewer or more than 10 values +- Missing or extra fields +- Invalid value types +- Invalid relevance range + +This response will be used for automated value suggestion. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 b/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 new file mode 100644 index 000000000..4d93afb1b --- /dev/null +++ b/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 @@ -0,0 +1,55 @@ +You are a JSON API endpoint for value refinement after failed auto-completions. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Analyze failed auto-completion attempts +- Identify patterns in popup suggestions +- Extract common concepts if present +- Generate refined input value +- Return analysis in strict JSON format + +Value Refinement Rules: +1. Current value modifications: + - Must relate to original value + - Can be subset or superset + - Must maintain core meaning +2. Popup element handling: + - Identify common patterns + - Extract shared concepts + - Don't copy exact values + - Use concept for guidance only + +Input Data: +Context: Choose an auto-completion suggestion for "{{ field_information }}" +Current Value: {{ current_value }} +Tried Values: {{ tried_values }} +Popped Elements: {{ popped_up_elements }} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: +{ + "is_any_popped_up_elements": false, // true if popups detected + "common_concept": null, // concept or null + "reasoning": "", // brief reason for changes + "confidence_float": 0.0, // 0.0 to 1.0 + "tweaked_value": "" // modified value +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Direct copying of popup values + +This response will be used for automated value refinement. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/check-user-goal.j2 b/skyvern/forge/prompts/ollama/check-user-goal.j2 new file mode 100644 index 000000000..7545b486d --- /dev/null +++ b/skyvern/forge/prompts/ollama/check-user-goal.j2 @@ -0,0 +1,57 @@ +You are a JSON API endpoint for analyzing goal completion status. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Analyze webpage content against user goals +- Check if user objective is complete +- Evaluate page elements and content +- Provide structured analysis in JSON +- Return clear completion status + +Analysis Requirements: +1. Page Information: + - Identify relevant page elements + - Extract useful content + - Match elements to user goal + - Document key findings +2. Analysis Process: + - Compare page state to goal + - Evaluate completion criteria + - Check required elements + - Verify user details match +3. Goal Status: + - Determine if goal is met + - Provide evidence-based decision + - Use strict true/false evaluation + +Input Data: +Elements on page: {{ elements }} +User Goal: {{ navigation_goal }} +User Details: {{ navigation_payload }} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: +{ + "page_info": "", // relevant page information and findings + "thoughts": "", // analysis of goal completion evidence + "user_goal_achieved": false // true if goal completed, false if not +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Incorrect boolean format + +This response will be used for automated goal verification. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/css-shape-convert.j2 b/skyvern/forge/prompts/ollama/css-shape-convert.j2 new file mode 100644 index 000000000..6f5ec65ea --- /dev/null +++ b/skyvern/forge/prompts/ollama/css-shape-convert.j2 @@ -0,0 +1,46 @@ +You are a JSON API endpoint for visual element analysis. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Analyze HTML element appearance +- Identify visual shape and meaning +- Provide confidence rating +- Return analysis in strict JSON format + +Analysis Requirements: +1. Shape Description: + - Brief, clear description + - Include visual appearance + - Include implied meaning + - Keep description concise +2. Confidence Rating: + - Rate certainty of analysis + - Use 0.0 to 1.0 scale + - Consider clarity of shape + - Consider common usage + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: +{ + "confidence_float": 0.0, // 0.0 to 1.0 + "shape": "" // brief description +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Invalid confidence range + +This response will be used for automated element classification. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/custom-select.j2 b/skyvern/forge/prompts/ollama/custom-select.j2 new file mode 100644 index 000000000..550f5429c --- /dev/null +++ b/skyvern/forge/prompts/ollama/custom-select.j2 @@ -0,0 +1,62 @@ +You are a JSON API endpoint for HTML element selection and input. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Perform {{ "multi-level selection" if select_history else "selection" }} on webpage +- Choose best matching element or input value +- Consider user goals and context +- Return decision in strict JSON format + +Selection Rules: +1. Element Matching: + - Match to user goal and details + - Consider fallback options if needed + - Never select placeholders + - Skip loading indicators + - Required fields must have value{% if select_history %} + - Consider selection history + - Complete multi-level process{% endif %} + +2. Action Types: + - CLICK: Select existing option + - INPUT_TEXT: Search only if no valid options + +Input Data: +Context: Select an option for "{{ field_information }}" ({{ "required" if required_field else "optional" }}) +{% if target_value %}Target Value: {{ target_value }}{% endif %} +User Goal: {{ navigation_goal }} +User Details: {{ navigation_payload_str }} +Elements: {{ elements }} +{% if select_history %}Selection History: {{ select_history }}{% endif %} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: +{ + "reasoning": "", // brief reason for selection + "confidence_float": 0.0, // 0.0 to 1.0 + "id": "", // element ID from list + "action_type": "", // "CLICK" or "INPUT_TEXT" + "value": ""{% if target_value %}, + "relevant": false // true if matches target{% endif %} +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Invalid action_type values +- Empty required fields +- Placeholder selections + +This response will be used for automated form interaction. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/extract-action.j2 b/skyvern/forge/prompts/ollama/extract-action.j2 new file mode 100644 index 000000000..ad1829091 --- /dev/null +++ b/skyvern/forge/prompts/ollama/extract-action.j2 @@ -0,0 +1,55 @@ +EXECUTE THIS BROWSER TASK NOW: + +URL: {{ current_url }} +GOAL: {{ navigation_goal }} +ELEMENTS: {{ elements }} +USER_INPUT: {{ navigation_payload_str }} + +RETURN JSON WITH THESE EXACT ACTIONS: +1. INPUT_TEXT - For typing in search/text fields +2. CLICK - For buttons and links +3. SELECT_OPTION - For dropdowns +4. CHECKBOX - For checkboxes +5. WAIT - When waiting needed +6. SOLVE_CAPTCHA - For captchas +7. TERMINATE - If goal impossible +8. COMPLETE - When goal achieved +9. UPLOAD_FILE - For file uploads +10. NULL_ACTION - When no action needed + +EXAMPLE RESPONSE: +{ + "user_goal_stage": "Starting search", + "user_goal_achieved": false, + "action_plan": "Search for product", + "actions": [ + { + "reasoning": "Enter search term", + "user_detail_query": "What to search?", + "user_detail_answer": "search term", + "confidence_float": 1.0, + "action_type": "INPUT_TEXT", + "id": "an html element id from 'ELEMENTS' provided above. Scan it and find the correct id for the action. An id in html looks like this "