Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

run yt-dlp after brozzling a page (if at all) #276

Merged
merged 13 commits into from
Apr 24, 2024
56 changes: 13 additions & 43 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
it runs yt-dlp on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier

Copyright (C) 2014-2023 Internet Archive
Copyright (C) 2014-2024 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -242,11 +242,20 @@ def brozzle_page(
enable_youtube_dl=True,
):
self.logger.info("brozzling {}".format(page))
ydl_fetches = None
outlinks = set()
if enable_youtube_dl and not page.url.lower().endswith(".pdf"):

try:
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)

if enable_youtube_dl and ydl.should_ytdlp(page):
try:
ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
raise
except brozzler.ShutdownRequested:
Expand All @@ -270,23 +279,6 @@ def brozzle_page(
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True
)

if self._needs_browsing(page, ydl_fetches):
self.logger.info("needs browsing: %s", page)
try:
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)
else:
if not self._already_fetched(page, ydl_fetches):
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
else:
self.logger.info("already fetched: %s", page)

return outlinks

def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
Expand Down Expand Up @@ -415,28 +407,6 @@ def _fetch_url(self, site, url=None, page=None):
except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e

def _needs_browsing(self, page, ydl_fetches):
    """
    Decides whether `page` still has to be loaded in the browser, based on
    what yt-dlp fetched.

    Returns True when `ydl_fetches` is empty or None, when no final bounce
    is found for `page.url`, or when at least one final bounce reports an
    html or xhtml content type. Returns False otherwise.
    """
    # nothing recorded by yt-dlp, so the browser has to do the work
    if not ydl_fetches:
        return True

    bounces = ydl.final_bounces(ydl_fetches, page.url)
    if not bounces:
        return True

    html_content_types = ("text/html", "application/xhtml+xml")
    return any(
        bounce["response_headers"].get_content_type() in html_content_types
        for bounce in bounces
    )

def _already_fetched(self, page, ydl_fetches):
    """
    Reports whether yt-dlp already fetched `page.url` successfully.

    Returns True if any final bounce for `page.url` in `ydl_fetches` was a
    GET that got a 200 response. Returns False otherwise, including when
    `ydl_fetches` is empty or None.
    """
    if not ydl_fetches:
        return False

    return any(
        bounce["method"] == "GET" and bounce["response_code"] == 200
        for bounce in ydl.final_bounces(ydl_fetches, page.url)
    )

def brozzle_site(self, browser, site):
try:
site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
Expand Down
88 changes: 40 additions & 48 deletions brozzler/ydl.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler

Copyright (C) 2023 Internet Archive
Copyright (C) 2024 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,7 @@
from yt_dlp.utils import match_filter_func
import brozzler
import urllib.request
from urllib.parse import urlparse
import tempfile
import urlcanon
import os
Expand All @@ -32,6 +33,33 @@
thread_local = threading.local()


def is_html_maybe(url):
    """
    Guesses, from the url alone, whether `url` might point at an html page.

    Args:
        url (str): the url to check

    Returns:
        bool: False if `url` contains "chrome-error:" or if the extension
        of its path component (query string and fragment are ignored) is
        one of the known non-html extensions; True otherwise.
    """
    if "chrome-error:" in url:
        return False

    skip_url_exts = frozenset(
        ["pdf", "jpg", "jpeg", "png", "gif", "mp3", "mp4", "mpeg", "css", "js"]
    )

    # only the path matters: "a.jpg?next=b.html" is still an image
    path = urlparse(url).path
    # compare case-insensitively so ".PDF" is treated like ".pdf", and match
    # the whole extension rather than a prefix so html-serving extensions
    # such as ".jsp" or ".gifv" are not mistaken for "js" / "gif"
    ext = os.path.splitext(path)[1][1:].lower()
    return ext not in skip_url_exts


def should_ytdlp(page):
    """
    Decides whether yt-dlp should be run for `page`.

    The url that is checked is `page.redirect_url` when it is set, otherwise
    `page.url`. Returns True when `is_html_maybe` accepts that url; otherwise
    logs a warning and returns False.
    """
    candidate_url = page.redirect_url or page.url

    if is_html_maybe(candidate_url):
        return True

    logging.warning(
        "skipping yt-dlp for %s due to unsupported extension", candidate_url
    )
    return False


class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers):
self.extra_headers = extra_headers
Expand Down Expand Up @@ -67,35 +95,6 @@ def reset(self):
self.fetches = []


def final_bounces(fetches, url):
    """
    Follows the redirect chain recorded in `fetches`, starting at `url`, and
    returns every fetch whose url equals the final destination. More than
    one fetch can match, for example when the same url was requested with
    HEAD and then with GET.

    Args:
        fetches (list of dict): each with at least 'url' and
            'response_headers'
        url (str): the url at which to start following redirects

    Returns:
        list of dict: the fetches for the final url, in their original order
    """
    # XXX check http status 301,302,303,307? check for "uri" header
    # as well as "location"? see urllib.request.HTTPRedirectHandler
    # a later fetch of the same url overrides an earlier one
    redirect_by_url = {
        fetch["url"]: fetch
        for fetch in fetches
        if "location" in fetch["response_headers"]
    }

    destination = url
    while destination in redirect_by_url:
        # pop each hop as it is followed so a redirect loop terminates
        hop = redirect_by_url.pop(destination)
        destination = urllib.parse.urljoin(
            hop["url"], hop["response_headers"]["location"]
        )

    return [fetch for fetch in fetches if fetch["url"] == destination]


def _build_youtube_dl(worker, destdir, site, page):
"""
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
Expand Down Expand Up @@ -183,8 +182,8 @@ def _push_video_to_warcprox(self, site, info_dict, postprocessor):
else:
url = info_dict.get("url", "")

# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
if url.endswith(".m3u8") or url == "":
# skip urls containing .m3u8, to avoid duplicates handled by FixupM3u8
if url == "" or ".m3u8" in url:
return

size = os.path.getsize(info_dict["filepath"])
Expand Down Expand Up @@ -344,28 +343,29 @@ def _remember_videos(page, fetches, pushed_videos=None):


def _try_youtube_dl(worker, ydl, site, page):
ytdlp_url = page.redirect_url if page.redirect_url else page.url
try:
logging.info("trying yt-dlp on %s", page)
logging.info("trying yt-dlp on %s", ytdlp_url)

with brozzler.thread_accept_exceptions():
# we do whatwg canonicalization here to avoid "<urlopen error
# no host given>" resulting in ProxyError
# needs automated test
# and yt-dlp needs sanitize_info for extract_info
ie_result = ydl.sanitize_info(
ydl.extract_info(str(urlcanon.whatwg(page.url)))
ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)))
)
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s",
page,
ytdlp_url,
)
worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
Expand All @@ -391,7 +391,7 @@ def _try_youtube_dl(worker, ydl, site, page):
):
# connection problem when using a proxy == proxy error (XXX?)
raise brozzler.ProxyError(
"yt-dlp hit apparent proxy error from " "%s" % page.url
"yt-dlp hit apparent proxy error from " "%s" % ytdlp_url
) from e
else:
raise
Expand All @@ -408,15 +408,7 @@ def do_youtube_dl(worker, site, page):
page (brozzler.Page): the page we are brozzling

Returns:
tuple with two entries:
`list` of `dict`: with info about urls fetched:
[{
'url': ...,
'method': ...,
'response_code': ...,
'response_headers': ...,
}, ...]
`list` of `str`: outlink urls
`list` of `str`: outlink urls
"""
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
ydl = _build_youtube_dl(worker, tempdir, site, page)
Expand All @@ -431,5 +423,5 @@ def do_youtube_dl(worker, site, page):
"https://www.youtube.com/watch?v=%s" % e["id"]
for e in ie_result.get("entries_no_dl", [])
}
# any outlinks for other cases?
return ydl.fetch_spy.fetches, outlinks
# any outlinks for other cases? soundcloud, maybe?
return outlinks
Loading