Skip to content

Commit

Permalink
use page headers to browse or fetch
Browse files Browse the repository at this point in the history
  • Loading branch information
Barbara Miller committed Apr 18, 2024
1 parent f2c89d1 commit 700c80b
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 54 deletions.
112 changes: 79 additions & 33 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,43 +244,89 @@ def brozzle_page(
self.logger.info("brozzling {}".format(page))
outlinks = set()

try:
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)
self._get_page_headers(page)

if enable_youtube_dl and ydl.should_ytdlp(page):
if self._needs_browsing(page):
self.logger.info("needs browsing: %s", page)
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
raise
except brozzler.ShutdownRequested:
raise
except brozzler.ProxyError:
raise
except Exception as e:
if (
hasattr(e, "exc_info")
and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 430
):
self.logger.info(
"youtube-dl got %s %s processing %s",
e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
)
else:
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True
)
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)

if enable_youtube_dl and ydl.should_ytdlp(page):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
raise
except brozzler.ShutdownRequested:
raise
except brozzler.ProxyError:
raise
except Exception as e:
if (
hasattr(e, "exc_info")
and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 430
):
self.logger.info(
"youtube-dl got %s %s processing %s",
e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
)
else:
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True
)
else:
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
return outlinks

def _get_page_headers(self, page):
    """Fetch the response headers for ``page.url`` and record them on ``page``.

    Issues a streaming GET and, for each of the headers below that the
    server sent, sets the matching attribute on ``page``:

    * ``Content-Type``   -> ``page.content_type``
    * ``Content-Length`` -> ``page.content_length`` (converted to ``int``)
    * ``Last-Modified``  -> ``page.last_modified``

    Attributes whose header is absent are left untouched.  A
    ``Content-Length`` value that is not a valid integer is logged and
    skipped instead of raising.
    """
    # NOTE(review): no timeout is passed to requests.get, so a stalled
    # server can block this call indefinitely -- consider adding one once
    # callers are prepared to handle requests.Timeout.
    with requests.get(page.url, stream=True) as r:
        # r.headers is a case-insensitive mapping, so one lookup per header
        # covers every capitalisation.  The previous per-case chains were
        # redundant and, for Content-Type, fell back to the Content-Length
        # header by mistake.
        content_type = r.headers.get("Content-Type")
        if content_type is not None:
            page.content_type = content_type
            self.logger.info(
                "url %s content_type is %s", page.url, page.content_type
            )

        content_length = r.headers.get("Content-Length")
        if content_length is not None:
            try:
                page.content_length = int(content_length)
            except ValueError:
                # Malformed header value: keep going, the length is only
                # advisory metadata.
                self.logger.warning(
                    "url %s has unparseable content_length %r",
                    page.url,
                    content_length,
                )
            else:
                self.logger.info(
                    "url %s content_length is %s", page.url, page.content_length
                )

        last_modified = r.headers.get("Last-Modified")
        if last_modified is not None:
            page.last_modified = last_modified
            self.logger.info(
                "url %s last_modified is %s", page.url, page.last_modified
            )

def _needs_browsing(self, page):
    """Return whether ``page`` should be loaded in the browser.

    A page is skipped (``False``) only when it carries a non-empty
    ``content_type`` that does not mention ``html``.  Pages with no known
    content type are browsed (``True``).

    The comparison is case-insensitive, so a value such as ``text/HTML``
    still counts as HTML.
    """
    # Tolerate page objects on which content_type was never set.
    content_type = getattr(page, "content_type", None)
    if not content_type:
        return True
    return "html" in content_type.lower()


def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_jpeg):
if on_screenshot:
Expand Down
22 changes: 1 addition & 21 deletions brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from yt_dlp.utils import match_filter_func
import brozzler
import urllib.request
from urllib.parse import urlparse
import tempfile
import urlcanon
import os
Expand All @@ -32,29 +31,10 @@

thread_local = threading.local()


def is_html_maybe(url):
    """Guess from the URL alone whether it might point at an HTML page.

    Returns ``False`` when ``url`` contains ``chrome-error:`` or when the
    extension of its path component starts with one of a fixed list of
    non-HTML extensions (documents, images, media, css, js).  Returns
    ``True`` otherwise, including when the path has no extension.

    Only the path is inspected; the query string and fragment are ignored.
    Extension matching is case-insensitive.
    """
    if "chrome-error:" in url:
        return False

    skip_url_exts = (
        "pdf", "jpg", "jpeg", "png", "gif", "mp3", "mp4", "mpeg", "css", "js",
    )

    _, ext = os.path.splitext(urlparse(url).path)
    # Drop the leading dot and lowercase so ".PDF" is treated like ".pdf".
    # The match is a prefix match, so suffixed extensions such as
    # "jpg:large" are still recognised.
    # NOTE(review): the prefix match also rejects e.g. "json" and "jsp" via
    # "js" -- confirm that is intended.
    return not ext[1:].lower().startswith(skip_url_exts)


# Decide whether yt-dlp should be attempted for `page`; returns a bool.
# NOTE(review): this view is a flattened commit diff with indentation lost, so
# lines from the old and new versions may be interleaved below -- confirm
# against the real file which conditional the `return False` belongs to.
def should_ytdlp(page):
# Use the redirect target when one is set, otherwise the page's own URL.
ytdlp_url = page.redirect_url if page.redirect_url else page.url

# Warn when the URL is rejected by is_html_maybe().
if not is_html_maybe(ytdlp_url):
logging.warning(
"skipping yt-dlp for %s due to unsupported extension", ytdlp_url
)
# URLs containing "chrome-error:" are rejected.
if "chrome-error:" in ytdlp_url:
return False

return True
Expand Down

0 comments on commit 700c80b

Please sign in to comment.