From 700c80ba903250164d128a2cc10f8a549b4b4a7a Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 18 Apr 2024 15:41:00 -0700
Subject: [PATCH] use page headers to browse or fetch

---
 brozzler/worker.py | 126 +++++++++++++++++++++++++++++++++------------
 brozzler/ydl.py    |  22 +-------
 2 files changed, 94 insertions(+), 54 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index ef294342..259ccd69 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -244,43 +244,103 @@ def brozzle_page(
         self.logger.info("brozzling {}".format(page))
         outlinks = set()
 
-        try:
-            browser_outlinks = self._browse_page(
-                browser, site, page, on_screenshot, on_request
-            )
-            outlinks.update(browser_outlinks)
-        except brozzler.PageInterstitialShown:
-            self.logger.info("page interstitial shown (http auth): %s", page)
+        self._get_page_headers(page)
 
-        if enable_youtube_dl and ydl.should_ytdlp(page):
+        if self._needs_browsing(page):
+            self.logger.info("needs browsing: %s", page)
             try:
-                ydl_outlinks = ydl.do_youtube_dl(self, site, page)
-                outlinks.update(ydl_outlinks)
-            except brozzler.ReachedLimit as e:
-                raise
-            except brozzler.ShutdownRequested:
-                raise
-            except brozzler.ProxyError:
-                raise
-            except Exception as e:
-                if (
-                    hasattr(e, "exc_info")
-                    and len(e.exc_info) >= 2
-                    and hasattr(e.exc_info[1], "code")
-                    and e.exc_info[1].code == 430
-                ):
-                    self.logger.info(
-                        "youtube-dl got %s %s processing %s",
-                        e.exc_info[1].code,
-                        e.exc_info[1].msg,
-                        page.url,
-                    )
-                else:
-                    self.logger.error(
-                        "youtube_dl raised exception on %s", page, exc_info=True
-                    )
+                browser_outlinks = self._browse_page(
+                    browser, site, page, on_screenshot, on_request
+                )
+                outlinks.update(browser_outlinks)
+            except brozzler.PageInterstitialShown:
+                self.logger.info("page interstitial shown (http auth): %s", page)
+
+            if enable_youtube_dl and ydl.should_ytdlp(page):
+                try:
+                    ydl_outlinks = ydl.do_youtube_dl(self, site, page)
+                    outlinks.update(ydl_outlinks)
+                except brozzler.ReachedLimit as e:
+                    raise
+                except brozzler.ShutdownRequested:
+                    raise
+                except brozzler.ProxyError:
+                    raise
+                except Exception as e:
+                    if (
+                        hasattr(e, "exc_info")
+                        and len(e.exc_info) >= 2
+                        and hasattr(e.exc_info[1], "code")
+                        and e.exc_info[1].code == 430
+                    ):
+                        self.logger.info(
+                            "youtube-dl got %s %s processing %s",
+                            e.exc_info[1].code,
+                            e.exc_info[1].msg,
+                            page.url,
+                        )
+                    else:
+                        self.logger.error(
+                            "youtube_dl raised exception on %s", page, exc_info=True
+                        )
+        else:
+            self.logger.info("needs fetch: %s", page)
+            self._fetch_url(site, page=page)
         return outlinks
 
+    def _get_page_headers(self, page):
+        """
+        Requests page.url and records response headers on the page.
+
+        Sets page.content_type, page.content_length and page.last_modified
+        from the Content-Type, Content-Length and Last-Modified headers,
+        each only when the response carries that header.
+        """
+        # stream=True defers the body download; only the headers are read
+        with requests.get(page.url, stream=True) as r:
+            # r.headers matches names case-insensitively, so one lookup
+            # per header covers every capitalization
+            content_type = r.headers.get("Content-Type")
+            if content_type is not None:
+                page.content_type = content_type
+                self.logger.info(
+                    "url %s content_type is %s", page.url, page.content_type
+                )
+
+            content_length = r.headers.get("Content-Length")
+            if content_length is not None:
+                try:
+                    page.content_length = int(content_length)
+                except ValueError:
+                    # a malformed header should not abort brozzling the page
+                    self.logger.warning(
+                        "url %s has invalid content-length %r",
+                        page.url,
+                        content_length,
+                    )
+                else:
+                    self.logger.info(
+                        "url %s content_length is %s",
+                        page.url,
+                        page.content_length,
+                    )
+
+            last_modified = r.headers.get("Last-Modified")
+            if last_modified is not None:
+                page.last_modified = last_modified
+                self.logger.info(
+                    "url %s last_modified is %s", page.url, page.last_modified
+                )
+
+    def _needs_browsing(self, page):
+        """
+        Returns False when page.content_type is set and does not contain
+        "html", True otherwise.
+        """
+        if page.content_type and "html" not in page.content_type:
+            return False
+        return True
+
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
         def _on_screenshot(screenshot_jpeg):
             if on_screenshot:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 635839b1..af0c3139 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -21,7 +21,6 @@
 from yt_dlp.utils import match_filter_func
 import brozzler
 import urllib.request
-from urllib.parse import urlparse
 import tempfile
 import urlcanon
 import os
@@ -32,29 +31,10 @@
 
 thread_local = threading.local()
 
-
-def is_html_maybe(url):
-    if "chrome-error:" in url:
-        return False
-
-    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp3", "mp4", "mpeg", "css", "js"]
-
-    parsed_url = urlparse(url)
-    base_url, ext = os.path.splitext(parsed_url.path)
-    ext = ext[1:]
-    for skip in skip_url_exts:
-        if ext.startswith(skip):
-            return False
-    return True
-
-
 def should_ytdlp(page):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 
-    if not is_html_maybe(ytdlp_url):
-        logging.warning(
-            "skipping yt-dlp for %s due to unsupported extension", ytdlp_url
-        )
+    if "chrome-error:" in ytdlp_url:
         return False
     return True
 