parse html

astronomer · Nov 30, 2023 · 3de2485 · 3de2485
1 parent 87cce4f
commit 3de2485
Showing 1 changed file with 51 additions and 0 deletions.
diff --git a/airflow/extract1.py b/airflow/extract1.py
@@ -0,0 +1,51 @@
+import requests
+from urllib.parse import urlparse, urljoin
+from bs4 import BeautifulSoup
+import time
+
+def is_valid_url(url):
+    """Return True when `url` parses with both a scheme and a network location."""
+    parts = urlparse(url)
+    if not parts.scheme:
+        return False
+    return bool(parts.netloc)
+
+
+# Every same-site URL discovered so far; get_all_website_links() adds to it
+# and skips links that are already present. Printed once the crawl finishes.
+urls = set()
+
+
+def get_all_website_links(url):
+    """
+    Return the not-yet-seen links found on `url` that belong to the same website.
+
+    Each `<a href>` on the page is resolved against `url`, reduced to
+    scheme://netloc/path (query string and fragment dropped), and kept only
+    if it has a scheme and host, its host equals the host of `url`, and it
+    is not already in the module-level `urls` set. Kept links are added to
+    `urls`.
+
+    Returns a new set holding only the links first discovered by this call
+    (empty when the page contributes nothing new), so the caller can iterate
+    it safely while later calls keep growing `urls`.
+    """
+    domain_name = urlparse(url).netloc
+    # A timeout keeps one unresponsive page from hanging the whole crawl.
+    response = requests.get(url, timeout=10)
+    soup = BeautifulSoup(response.content, "html.parser")
+    new_links = set()
+    for a_tag in soup.find_all("a"):
+        href = a_tag.attrs.get("href")
+        if not href:
+            continue
+        resolved = urljoin(url, href)
+        # Validate *before* rebuilding the string: rebuilding first turns
+        # e.g. "mailto:a@b" into "mailto://a@b", which then looks valid.
+        if not is_valid_url(resolved):
+            continue
+        parsed_href = urlparse(resolved)
+        # Compare hosts exactly; a substring test on the whole URL would also
+        # accept foreign URLs that merely mention this domain in their path.
+        if parsed_href.netloc != domain_name:
+            continue
+        # remove URL GET parameters, URL fragments, etc.
+        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
+        if href in urls:
+            continue
+        urls.add(href)
+        new_links.add(href)
+    # Return only after every anchor has been examined.
+    return new_links
+
+
+def crawl(url):
+    """
+    Crawl `url` and every same-site page reachable from it.
+
+    Each page is fetched at most once with get_all_website_links(), which
+    records the links it accepts in the module-level `urls` set.
+    Returns None.
+    """
+    visited = set()
+    # An explicit stack instead of recursion: a long chain of newly found
+    # pages could otherwise exceed Python's recursion limit.
+    pending = [url]
+    while pending:
+        page = pending.pop()
+        if page in visited:
+            continue
+        visited.add(page)
+        links = get_all_website_links(page)
+        if links:
+            # extend() copies the links immediately, so it does not matter
+            # if the returned set is mutated by later calls.
+            pending.extend(links)
+
+
+# Time one full crawl of the documentation site, then report what was found.
+start = time.time()
+crawl("https://astronomer-providers.readthedocs.io/en/stable")
+print(urls)
+# The stop time is taken after printing, so the figure includes the print.
+end = time.time()
+elapsed = end - start
+print(elapsed)