Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

brozzler yt-dlp should be able to specify a separate tempdir #307

Merged
5 commits merged on Dec 12, 2024
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions brozzler/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,7 @@ def browse_page(
skip_extract_outlinks=False,
skip_visit_hashtags=False,
skip_youtube_dl=False,
ytdlp_tmpdir="/tmp",
simpler404=False,
page_timeout=300,
behavior_timeout=900,
Expand Down
14 changes: 14 additions & 0 deletions brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,12 @@ def brozzle_page(argv=None):
arg_parser.add_argument(
"--skip-youtube-dl", dest="skip_youtube_dl", action="store_true"
)
arg_parser.add_argument(
"--ytdlp_tmpdir",
dest="ytdlp_tmpdir",
default="/tmp",
help="specify a temp dir for ytdlp; defaults to /tmp",
)
arg_parser.add_argument("--simpler404", dest="simpler404", action="store_true")
add_common_options(arg_parser, argv)

Expand Down Expand Up @@ -292,6 +298,7 @@ def brozzle_page(argv=None):
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl,
ytdlp_tmpdir=args.ytdlp_tmpdir,
simpler404=args.simpler404,
screenshot_full_page=args.screenshot_full_page,
download_throughput=args.download_throughput,
Expand Down Expand Up @@ -533,6 +540,12 @@ def brozzler_worker(argv=None):
action="store_true",
help=argparse.SUPPRESS,
)
# Hidden worker option: parent directory under which yt-dlp temporary
# directories are created. `help` must be the argparse.SUPPRESS constant,
# not the string "argparse.SUPPRESS"; the string would be printed verbatim
# in --help output instead of hiding the option.
arg_parser.add_argument(
    "--ytdlp_tmpdir",
    dest="ytdlp_tmpdir",
    default="/tmp",
    help=argparse.SUPPRESS,
)
arg_parser.add_argument(
"--stealth",
dest="stealth",
Expand Down Expand Up @@ -613,6 +626,7 @@ def get_skip_av_seeds():
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl,
ytdlp_tmpdir=args.ytdlp_tmpdir,
stealth=args.stealth,
metrics_port=args.metrics_port,
registry_url=args.registry_url,
Expand Down
3 changes: 3 additions & 0 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(
skip_extract_outlinks=False,
skip_visit_hashtags=False,
skip_youtube_dl=False,
ytdlp_tmpdir="/tmp",
simpler404=False,
screenshot_full_page=False,
page_timeout=300,
Expand All @@ -89,6 +90,7 @@ def __init__(
self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
self._ytdlp_tmpdir = ytdlp_tmpdir
self._simpler404 = simpler404
self._screenshot_full_page = screenshot_full_page
self._page_timeout = page_timeout
Expand Down Expand Up @@ -445,6 +447,7 @@ def _on_service_worker_version_updated(chrome_msg):
skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags,
skip_youtube_dl=self._skip_youtube_dl,
ytdlp_tmpdir=self._ytdlp_tmpdir,
simpler404=self._simpler404,
screenshot_full_page=self._screenshot_full_page,
page_timeout=self._page_timeout,
Expand Down
5 changes: 4 additions & 1 deletion brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,10 @@ def do_youtube_dl(worker, site, page):
Returns:
`list` of `str`: outlink urls
"""
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
with tempfile.TemporaryDirectory(
prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir
) as tempdir:
logging.info("tempdir for yt-dlp: %s", tempdir)
ydl = _build_youtube_dl(worker, tempdir, site, page)
ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set()
Expand Down
Loading