Skip to content

Commit

Permalink
parse html
Browse files Browse the repository at this point in the history
  • Loading branch information
pankajastro committed Nov 30, 2023
1 parent 87cce4f commit 3de2485
Showing 1 changed file with 51 additions and 0 deletions.
51 changes: 51 additions & 0 deletions airflow/extract1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import time

def is_valid_url(url):
    """Report whether `url` carries both a scheme and a network location.

    Relative paths and references without a host part (for example
    ``mailto:`` links) are reported as invalid.
    """
    parts = urlparse(url)
    if not parts.netloc:
        return False
    return bool(parts.scheme)


# Shared accumulator of the normalized links found so far; it is filled by
# get_all_website_links and printed once the crawl finishes.
urls = set()


def get_all_website_links(url, *, timeout=10):
    """Collect the not-yet-seen links on `url` that stay on the same host.

    Every anchor's ``href`` is resolved against `url`, reduced to
    ``scheme://netloc/path`` (query string and fragment dropped) and kept
    only when its host equals the host of `url`. Each kept link is recorded
    in the module-level ``urls`` set.

    Args:
        url: Absolute URL of the page to fetch and scan.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so an unresponsive host cannot hang the crawl.

    Returns:
        A fresh set containing only the links this call added to ``urls``.
        Returning a private set (rather than the shared ``urls`` object)
        lets a caller iterate the result while further calls keep adding to
        ``urls``, and guarantees each link is handed out exactly once.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` when the
            page cannot be fetched.
    """
    domain_name = urlparse(url).netloc
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    new_links = set()
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            continue
        parsed_href = urlparse(urljoin(url, href))
        # Remove URL GET parameters, URL fragments, etc. so that variants of
        # the same page collapse into a single entry.
        href = f"{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}"
        if not is_valid_url(href):
            continue
        # Compare hosts exactly; a substring test on the whole URL would also
        # accept foreign sites that merely mention the domain in their path.
        if parsed_href.netloc != domain_name:
            continue
        if href in urls:
            continue
        urls.add(href)
        new_links.add(href)
    return new_links


def crawl(url):
    """Visit `url` and every page reachable from it, fetching each page once.

    The traversal is iterative (explicit stack) instead of recursive so a
    large site cannot exhaust the interpreter's recursion limit, and a
    visited set prevents pages that link to each other from being fetched
    over and over.

    Args:
        url: Absolute URL at which the crawl starts.

    Returns:
        None. Discovered links are whatever ``get_all_website_links`` records
        as a side effect.
    """
    visited = set()
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        # Iterate over a snapshot: the returned collection may be a shared set
        # that later get_all_website_links calls keep mutating.
        for link in list(get_all_website_links(current) or ()):
            if link not in visited:
                pending.append(link)


# Time one crawl that starts at the astronomer-providers stable docs page.
start = time.time()
crawl("https://astronomer-providers.readthedocs.io/en/stable")
# Dump everything collected in the module-level `urls` set.
print(urls)
# Taken after the dump, so the time spent printing is part of the measurement.
end = time.time()
# Elapsed wall-clock time in seconds.
print(end - start)

0 comments on commit 3de2485

Please sign in to comment.