-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
75 lines (59 loc) · 2.27 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import ast
import json
from bs4 import BeautifulSoup
from langsmith.evaluation._runner import ET
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from dotenv import load_dotenv
import os
def scrape_website(url):
    """Fetch *url* with headless Chrome and return the rendered page HTML.

    Args:
        url: Address to load; the browser issues a normal GET request for it.

    Returns:
        The full rendered page source as a string.
    """
    load_dotenv()  # pull CHROMEDRIVER_PATH (if set) from a local .env file
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # BUG FIX: os.getenv() takes an environment-variable *name*, not a path.
    # The original os.getenv('/chrome-linux64') always returned None. Read a
    # real variable instead; when it is unset, a bare Service() lets Selenium
    # Manager / PATH resolve the chromedriver binary.
    driver_path = os.getenv('CHROMEDRIVER_PATH')
    service = Service(driver_path) if driver_path else Service()
    driver = Chrome(service=service, options=chrome_options)
    try:
        driver.get(url)  # this makes a GET request to the URL
        print("page loaded")
        return driver.page_source
    finally:
        # BUG FIX: the original never quit the driver, leaking a Chrome
        # process per call.
        driver.quit()
def extract_body_content(html_content):
    """Extract the <body> of *html_content*, decoding typed payloads.

    If the body element carries a ``data-type`` attribute of ``xml``,
    ``json`` or ``javascript``, its text is parsed accordingly; any other
    body is returned as its raw markup string.

    Args:
        html_content: A full HTML document as a string.

    Returns:
        An ``xml.etree.ElementTree.Element`` for ``xml`` bodies, the decoded
        value for ``json`` bodies, the evaluated Python literal for
        ``javascript`` bodies (``None`` when the text is not a valid
        literal), ``str(body)`` otherwise, or ``""`` when the document has
        no <body> at all.
    """
    # FIX: use the stdlib ElementTree directly instead of relying on the
    # private re-export `from langsmith.evaluation._runner import ET`, which
    # can break on any langsmith release.
    import xml.etree.ElementTree as ET

    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    if body_content:
        content_type = body_content.get('data-type')
        if content_type == 'xml':
            # Process XML data as needed
            return ET.fromstring(body_content.text)
        elif content_type == 'json':
            # Process JSON data as needed
            return json.loads(body_content.text)
        elif content_type == 'javascript':
            try:
                # NOTE: literal_eval only accepts Python-literal syntax;
                # arbitrary JavaScript will fall through to the except.
                return ast.literal_eval(body_content.text)
            except (SyntaxError, ValueError):
                # Handle invalid JavaScript data
                return None
        else:
            return str(body_content)
    return ""
def clean_body_content(body_content):
    """Return the plain text of *body_content* with scripts/styles removed.

    Each surviving line is whitespace-trimmed and blank lines are dropped,
    yielding a compact newline-separated text dump of the markup.
    """
    parsed = BeautifulSoup(body_content, "html.parser")
    # Remove executable/presentational tags so only human-visible text remains.
    for tag in parsed(["script", "style"]):
        tag.extract()
    raw_text = parsed.get_text(separator="\n")
    kept = []
    for raw_line in raw_text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept.append(stripped)
    return "\n".join(kept)
def split_dom_content(dom_content, max_length=6000):
    """Split *dom_content* into consecutive chunks of at most *max_length* chars.

    Args:
        dom_content: The string to split.
        max_length: Maximum size of each chunk (default 6000).

    Returns:
        A list of substrings; the final chunk may be shorter. Empty input
        yields an empty list.
    """
    chunks = []
    offset = 0
    total = len(dom_content)
    while offset < total:
        chunks.append(dom_content[offset:offset + max_length])
        offset += max_length
    return chunks