internetarchive · galgeek · Apr 24, 2024 · Apr 2, 2024 · Apr 4, 2024 · Apr 10, 2024
diff --git a/brozzler/worker.py b/brozzler/worker.py
@@ -3,7 +3,7 @@
 it runs yt-dlp on them, browses them and runs behaviors if appropriate,
 scopes and adds outlinks to the frontier
 
-Copyright (C) 2014-2023 Internet Archive
+Copyright (C) 2014-2024 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -242,11 +242,20 @@ def brozzle_page(
         enable_youtube_dl=True,
     ):
         self.logger.info("brozzling {}".format(page))
-        ydl_fetches = None
         outlinks = set()
-        if enable_youtube_dl and not page.url.lower().endswith(".pdf"):
+
+        try:
+            browser_outlinks = self._browse_page(
+                browser, site, page, on_screenshot, on_request
+            )
+            outlinks.update(browser_outlinks)
+        except brozzler.PageInterstitialShown:
+            self.logger.info("page interstitial shown (http auth): %s", page)
+
+        if enable_youtube_dl and ydl.should_ytdlp(page):
             try:
-                ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
+                ydl_outlinks = ydl.do_youtube_dl(self, site, page)
+                outlinks.update(ydl_outlinks)
             except brozzler.ReachedLimit as e:
                 raise
             except brozzler.ShutdownRequested:
@@ -270,23 +279,6 @@ def brozzle_page(
                     self.logger.error(
                         "youtube_dl raised exception on %s", page, exc_info=True
                     )
-
-        if self._needs_browsing(page, ydl_fetches):
-            self.logger.info("needs browsing: %s", page)
-            try:
-                browser_outlinks = self._browse_page(
-                    browser, site, page, on_screenshot, on_request
-                )
-                outlinks.update(browser_outlinks)
-            except brozzler.PageInterstitialShown:
-                self.logger.info("page interstitial shown (http auth): %s", page)
-        else:
-            if not self._already_fetched(page, ydl_fetches):
-                self.logger.info("needs fetch: %s", page)
-                self._fetch_url(site, page=page)
-            else:
-                self.logger.info("already fetched: %s", page)
-
         return outlinks
 
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
@@ -415,28 +407,6 @@ def _fetch_url(self, site, url=None, page=None):
         except requests.exceptions.ProxyError as e:
             raise brozzler.ProxyError("proxy error fetching %s" % url) from e
 
-    def _needs_browsing(self, page, ydl_fetches):
-        if ydl_fetches:
-            final_bounces = ydl.final_bounces(ydl_fetches, page.url)
-            if not final_bounces:
-                return True
-            for txn in final_bounces:
-                if txn["response_headers"].get_content_type() in [
-                    "text/html",
-                    "application/xhtml+xml",
-                ]:
-                    return True
-            return False
-        else:
-            return True
-
-    def _already_fetched(self, page, ydl_fetches):
-        if ydl_fetches:
-            for fetch in ydl.final_bounces(ydl_fetches, page.url):
-                if fetch["method"] == "GET" and fetch["response_code"] == 200:
-                    return True
-        return False
-
     def brozzle_site(self, browser, site):
         try:
             site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
@@ -1,7 +1,7 @@
 """
 brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
 
-Copyright (C) 2023 Internet Archive
+Copyright (C) 2024 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 from yt_dlp.utils import match_filter_func
 import brozzler
 import urllib.request
+from urllib.parse import urlparse
 import tempfile
 import urlcanon
 import os
@@ -32,6 +33,33 @@
 thread_local = threading.local()
 
 
+def is_html_maybe(url):
+    """Return False for urls that are unlikely to be html (chrome error
+    pages, known static/media path extensions); True otherwise."""
+    if "chrome-error:" in url:
+        return False
+
+    skip_url_exts = ("pdf", "jpg", "jpeg", "png", "gif", "mp3", "mp4", "mpeg", "css", "js")
+
+    # only the url path is examined; lowercased so ".PDF" is skipped like ".pdf"
+    ext = os.path.splitext(urlparse(url).path)[1][1:].lower()
+    # NOTE(review): prefix match, so ".jsp" and ".json" are skipped too -- confirm
+    # that is intended; str.startswith accepts a tuple of prefixes
+    return not ext.startswith(skip_url_exts)
+
+
+def should_ytdlp(page):
+    """Return True if yt-dlp should be tried on `page`, judged by
+    `is_html_maybe` on its redirect url if set, otherwise its url."""
+    ytdlp_url = page.redirect_url or page.url
+    html_maybe = is_html_maybe(ytdlp_url)
+    if not html_maybe:
+        logging.warning(
+            "skipping yt-dlp for %s due to unsupported extension", ytdlp_url
+        )
+    return html_maybe
+
+
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
         self.extra_headers = extra_headers
@@ -67,35 +95,6 @@ def reset(self):
         self.fetches = []
 
 
-def final_bounces(fetches, url):
-    """
-    Resolves redirect chains in `fetches` and returns a list of fetches
-    representing the final redirect destinations of the given url. There could
-    be more than one if for example youtube-dl hit the same url with HEAD and
-    then GET requests.
-    """
-    redirects = {}
-    for fetch in fetches:
-        # XXX check http status 301,302,303,307? check for "uri" header
-        # as well as "location"? see urllib.request.HTTPRedirectHandler
-        if "location" in fetch["response_headers"]:
-            redirects[fetch["url"]] = fetch
-
-    final_url = url
-    while final_url in redirects:
-        fetch = redirects.pop(final_url)
-        final_url = urllib.parse.urljoin(
-            fetch["url"], fetch["response_headers"]["location"]
-        )
-
-    final_bounces = []
-    for fetch in fetches:
-        if fetch["url"] == final_url:
-            final_bounces.append(fetch)
-
-    return final_bounces
-
-
 def _build_youtube_dl(worker, destdir, site, page):
     """
     Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
@@ -183,8 +182,8 @@ def _push_video_to_warcprox(self, site, info_dict, postprocessor):
             else:
                 url = info_dict.get("url", "")
 
-            # skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
-            if url.endswith(".m3u8") or url == "":
+            # skip urls containing .m3u8, to avoid duplicates handled by FixupM3u8
+            if url == "" or ".m3u8" in url:
                 return
 
             size = os.path.getsize(info_dict["filepath"])
@@ -344,28 +343,29 @@ def _remember_videos(page, fetches, pushed_videos=None):
 
 
 def _try_youtube_dl(worker, ydl, site, page):
+    ytdlp_url = page.redirect_url if page.redirect_url else page.url
     try:
-        logging.info("trying yt-dlp on %s", page)
+        logging.info("trying yt-dlp on %s", ytdlp_url)
 
         with brozzler.thread_accept_exceptions():
             # we do whatwg canonicalization here to avoid "<urlopen error
             # no host given>" resulting in ProxyError
             # needs automated test
             # and yt-dlp needs sanitize_info for extract_info
             ie_result = ydl.sanitize_info(
-                ydl.extract_info(str(urlcanon.whatwg(page.url)))
+                ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)))
             )
         _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
         if worker._using_warcprox(site):
             info_json = json.dumps(ie_result, sort_keys=True, indent=4)
             logging.info(
                 "sending WARCPROX_WRITE_RECORD request to warcprox "
                 "with yt-dlp json for %s",
-                page,
+                ytdlp_url,
             )
             worker._warcprox_write_record(
                 warcprox_address=worker._proxy_for(site),
-                url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
+                url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)),
                 warc_type="metadata",
                 content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                 payload=info_json.encode("utf-8"),
@@ -391,7 +391,7 @@ def _try_youtube_dl(worker, ydl, site, page):
         ):
             # connection problem when using a proxy == proxy error (XXX?)
             raise brozzler.ProxyError(
-                "yt-dlp hit apparent proxy error from " "%s" % page.url
+                "yt-dlp hit apparent proxy error from " "%s" % ytdlp_url
             ) from e
         else:
             raise
@@ -408,15 +408,7 @@ def do_youtube_dl(worker, site, page):
         page (brozzler.Page): the page we are brozzling
 
     Returns:
-        tuple with two entries:
-            `list` of `dict`: with info about urls fetched:
-                [{
-                    'url': ...,
-                    'method': ...,
-                    'response_code': ...,
-                    'response_headers': ...,
-                }, ...]
-            `list` of `str`: outlink urls
+         `list` of `str`: outlink urls
     """
     with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
         ydl = _build_youtube_dl(worker, tempdir, site, page)
@@ -431,5 +423,5 @@ def do_youtube_dl(worker, site, page):
                 "https://www.youtube.com/watch?v=%s" % e["id"]
                 for e in ie_result.get("entries_no_dl", [])
             }
-        # any outlinks for other cases?
-        return ydl.fetch_spy.fetches, outlinks
+        # any outlinks for other cases? soundcloud, maybe?
+        return outlinks