-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
75 lines (59 loc) · 2.27 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import ast
import json
from bs4 import BeautifulSoup
from langsmith.evaluation._runner import ET
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from dotenv import load_dotenv
import os
def scrape_website(url):
    """Fetch *url* with headless Chrome and return the rendered page HTML.

    Args:
        url: Address to load; the browser issues a normal GET request for it.

    Returns:
        The full rendered page source as a string.
    """
    load_dotenv()  # pull CHROMEDRIVER_PATH (if set) from a local .env file
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # BUG FIX: os.getenv() takes an environment-variable *name*, not a path.
    # The original os.getenv('/chrome-linux64') always returned None. Read a
    # real variable instead; when it is unset, a bare Service() lets Selenium
    # Manager / PATH resolve the chromedriver binary.
    driver_path = os.getenv('CHROMEDRIVER_PATH')
    service = Service(driver_path) if driver_path else Service()
    driver = Chrome(service=service, options=chrome_options)
    try:
        driver.get(url)  # this makes a GET request to the URL
        print("page loaded")
        return driver.page_source
    finally:
        # BUG FIX: the original never quit the driver, leaking a Chrome
        # process per call.
        driver.quit()
def extract_body_content(html_content):
    """Extract the <body> of *html_content*, decoding typed payloads.

    If the body element carries a ``data-type`` attribute of ``xml``,
    ``json`` or ``javascript``, its text is parsed accordingly; any other
    body is returned as its raw markup string.

    Args:
        html_content: A full HTML document as a string.

    Returns:
        An ``xml.etree.ElementTree.Element`` for ``xml`` bodies, the decoded
        value for ``json`` bodies, the evaluated Python literal for
        ``javascript`` bodies (``None`` when the text is not a valid
        literal), ``str(body)`` otherwise, or ``""`` when the document has
        no <body> at all.
    """
    # FIX: use the stdlib ElementTree directly instead of relying on the
    # private re-export `from langsmith.evaluation._runner import ET`, which
    # can break on any langsmith release.
    import xml.etree.ElementTree as ET

    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    if body_content:
        content_type = body_content.get('data-type')
        if content_type == 'xml':
            # Process XML data as needed
            return ET.fromstring(body_content.text)
        elif content_type == 'json':
            # Process JSON data as needed
            return json.loads(body_content.text)
        elif content_type == 'javascript':
            try:
                # NOTE: literal_eval only accepts Python-literal syntax;
                # arbitrary JavaScript will fall through to the except.
                return ast.literal_eval(body_content.text)
            except (SyntaxError, ValueError):
                # Handle invalid JavaScript data
                return None
        else:
            return str(body_content)
    return ""
def clean_body_content(body_content):
    """Return the plain text of *body_content* with scripts/styles removed.

    Each surviving line is whitespace-trimmed and blank lines are dropped,
    yielding a compact newline-separated text dump of the markup.
    """
    parsed = BeautifulSoup(body_content, "html.parser")
    # Remove executable/presentational tags so only human-visible text remains.
    for tag in parsed(["script", "style"]):
        tag.extract()
    raw_text = parsed.get_text(separator="\n")
    kept = []
    for raw_line in raw_text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept.append(stripped)
    return "\n".join(kept)
def split_dom_content(dom_content, max_length=6000):
    """Split *dom_content* into consecutive chunks of at most *max_length* chars.

    Args:
        dom_content: The string to split.
        max_length: Maximum size of each chunk (default 6000).

    Returns:
        A list of substrings; the final chunk may be shorter. Empty input
        yields an empty list.
    """
    chunks = []
    offset = 0
    total = len(dom_content)
    while offset < total:
        chunks.append(dom_content[offset:offset + max_length])
        offset += max_length
    return chunks