From fcebbc3deb06a614ddf8597d3dd30d91cafdce1d Mon Sep 17 00:00:00 2001 From: Morgan Aldridge Date: Mon, 20 May 2024 09:57:49 -0400 Subject: [PATCH] Added new '-w'/'--wait' option, accepting a number of seconds to sleep/wait between requests, plus a '--random-wait' option which will randomize the number of wait seconds by a factor of 0.5x-2x. These options are used by the new WaybackMachineDownloader#wait method which is called during subsequent requests. Issue #1 --- bin/wayback_machine_downloader | 8 ++++++-- lib/wayback_machine_downloader.rb | 10 +++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index 8b9f2fd..c6a1dbd 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -34,8 +34,12 @@ option_parser = OptionParser.new do |opts| options[:exact_url] = t end - opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t| - options[:only_filter] = t + opts.on("-w", "--wait SECONDS", Integer, "Wait the specified number of seconds between requests") do |t| + options[:wait_seconds] = t + end + + opts.on("--random-wait", "When used with --wait, randomize number of seconds waited between requests by a factor of 0.5 to 2") do |t| + options[:wait_randomize] = true end opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t| diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 35ff36d..a52fb58 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -18,7 +18,7 @@ class WaybackMachineDownloader attr_accessor :base_url, :exact_url, :directory, :all_timestamps, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, - :all, :maximum_pages, 
:threads_count, :wait_seconds, :wait_randomized
 
   def initialize params
     @base_url = params[:base_url]
@@ -32,6 +32,8 @@ def initialize params
     @all = params[:all]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
+    @wait_seconds = params[:wait_seconds].to_i
+    @wait_randomized = params[:wait_randomized] || params[:wait_randomize] # the CLI stores --random-wait under :wait_randomize
   end
 
   def backup_name
@@ -89,6 +91,7 @@ def get_all_snapshots_to_consider
     print "."
     unless @exact_url
       @maximum_pages.times do |page_index|
+        wait
         snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
         break if snapshot_list.empty?
         snapshot_list_to_consider += snapshot_list
@@ -208,6 +211,7 @@ def download_files
     @threads_count.times do
       threads << Thread.new do
         until file_queue.empty?
+          wait
           file_remote_info = file_queue.pop(true) rescue nil
           download_file(file_remote_info) if file_remote_info
         end
@@ -313,4 +317,17 @@ def file_list_by_timestamp
   def semaphore
     @semaphore ||= Mutex.new
   end
+
+  # Pauses the calling thread between requests.
+  #
+  # Sleeps for @wait_seconds seconds, or, when @wait_randomized is set, for
+  # @wait_seconds scaled by a random factor between 0.5 and 2.0. Does nothing
+  # when @wait_seconds is zero or negative (sleep rejects negative intervals).
+  def wait
+    return unless @wait_seconds.positive?
+
+    # rand(1.5) truncates its Float argument to 1 and therefore always
+    # returns 0; a Float range yields a Float factor within 0.5..2.0.
+    sleep(@wait_randomized ? @wait_seconds * rand(0.5..2.0) : @wait_seconds)
+  end
 end