webcrawling.py
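"""Recursive asynchronous web crawler.

Fetches a start URL with aiohttp, extracts the visible text via BeautifulSoup,
follows every link on the page up to ``max_depth`` levels deep, and returns the
collected page texts as a list of strings.
"""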
import asyncio

import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin

import nest_asyncio

# Allow nested event loops, e.g. when this script runs inside a Jupyter notebook.
nest_asyncio.apply()


async def fetch_url(session, url):
    """Fetch a URL and return the response body as text, or None on a client error."""
    try:
        async with session.get(url) as response:
            response.raise_for_status()
            return await response.text()
    except aiohttp.ClientError as e:
        print(f"Error fetching {url}: {e}")
        return None


async def crawl_webpage_async(url, max_depth, current_depth=0):
    """Recursively crawl `url` up to `max_depth` and return the extracted page texts."""
    if current_depth > max_depth:
        return []
    try:
        async with aiohttp.ClientSession() as session:
            response_text = await fetch_url(session, url)
            if not response_text:
                return []

            # Collect all visible text, skipping <script> and <style> contents.
            soup = BeautifulSoup(response_text, 'html.parser')
            text_content = []
            for element in soup.find_all(string=True):
                if element.parent.name not in ['script', 'style']:
                    text_content.append(element.strip())
            webpage_text = ' '.join(text_content)
            print(f"Depth {current_depth}: {url}")

            # Resolve every link to an absolute URL and crawl it one level deeper, concurrently.
            links = [link['href'] for link in soup.find_all('a', href=True)]
            extracted_texts = [webpage_text]
            tasks = []
            for link in links:
                absolute_link = urljoin(url, link)
                tasks.append(crawl_webpage_async(absolute_link, max_depth, current_depth + 1))

            # Each recursive call returns a list of texts, so flatten the gathered
            # results instead of appending the lists themselves.
            for child_texts in await asyncio.gather(*tasks):
                extracted_texts += child_texts
            return extracted_texts
    except aiohttp.ClientError as e:
        print(f"Error crawling {url}: {e}")
        return []


if __name__ == "__main__":
    start_url = 'https://www.t-systems.com/de/de'
    loop = asyncio.get_event_loop()
    extracted_texts = loop.run_until_complete(crawl_webpage_async(start_url, max_depth=1))

    # Storing texts to disk (expects a ./data directory to exist):
    # for i, text in enumerate(extracted_texts):
    #     filename = f"./data/doc_{i}.txt"
    #     with open(filename, "w") as f:
    #         f.write(text)