Skip to content

Commit

Permalink
Merge pull request #12 from scientist-softserv/i321-audit-iaids-script
Browse files Browse the repository at this point in the history
create script to audit preprocess status of IAIDs
  • Loading branch information
bkiahstroud authored Oct 10, 2024
2 parents 63f5b23 + edf822b commit ce5ad7e
Showing 1 changed file with 98 additions and 0 deletions.
98 changes: 98 additions & 0 deletions bin/audit_iaids.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env ruby

require 'dotenv'
require 'fileutils'
require 'json'
require 'pathname'
require 'ruby-progressbar'

Dotenv.load('.env.production')
load 'lib/space_stone.rb'

# Collects the names of the .jp2 files linked from an item's remote file listing.
#
# Requests "<remote_file_link>/" with the download service's login cookies,
# parses the response body as HTML and keeps every anchor href that matches
# /download.*\.jp2$/.
#
# @param iaid [String] identifier handed to SpaceStone::IaDownload
# @param data [Hash] audit record for the identifier; its 'status' key is
#   written only when the connection is refused
# @return [Array<String>] the last "%2F"-separated segment of each matching
#   href; empty when the connection is refused or the body is nil/empty
def fetch_filenames_from_ia(iaid, data)
  downloader = SpaceStone::IaDownload.new(id: iaid)
  listing_url = "#{downloader.remote_file_link}/" # trailing slash matters

  begin
    # login_cookies is evaluated inside the begin so a refused connection
    # while obtaining it is reported the same way as a refused GET.
    response = HTTParty.get(listing_url, headers: { 'Cookie' => downloader.login_cookies })
  rescue Errno::ECONNREFUSED
    data['status'] = "ERROR -- Couldn't connect to IA"
    return []
  end

  body = response.body
  return [] if body.nil? || body.empty?

  links = Nokogiri::HTML(body).css('a[href]').map { |anchor| anchor['href'] }
  jp2_links = links.grep(/download.*\.jp2$/)
  # %2F is a URL-encoded slash ("/"), so the last segment is the bare filename
  jp2_links.map { |href| href.split('%2F').last }
end

# Lists the filenames stored in @bucket under the given key prefix.
#
# @param prefix [String] key prefix passed to @bucket.objects
# @return [Array<String>] the last "/"-separated segment of each object key,
#   sorted
def fetch_filenames_from_s3(prefix)
  stored_objects = @bucket.objects(prefix: prefix).to_a
  filenames = stored_objects.map { |object| object.key.split('/').last }
  filenames.sort
end

# Reduces each filename to the text before its first dot.
#
# @param paths [Array<String>] filenames to strip
# @return [Array<String, nil>] one entry per input, in order; an input with
#   nothing before a dot-split (e.g. an empty string) yields nil
def basenames(paths)
  paths.map do |filename|
    segments = filename.split('.')
    segments.first
  end
end

# Derives the audit status for one record and stores it under data['status'].
#
# A record already marked with the connection error is left untouched.
# Otherwise the status is:
#   'ERROR -- No files found in IA' when data['ia_files'] is nil or empty,
#   'WARN' when any of the missing_* lists is non-empty,
#   'OK' in every other case.
#
# @param data [Hash] audit record; reads 'status', 'ia_files', 'missing_files',
#   'missing_ocr' and 'missing_thumbnails'
# @return [String, nil] the status that was stored, or nil when nothing changed
def log_status(data)
  return if data['status'] == "ERROR -- Couldn't connect to IA"

  ia_files = data['ia_files']
  status =
    if ia_files.nil? || ia_files.empty?
      'ERROR -- No files found in IA'
    elsif %w[missing_files missing_ocr missing_thumbnails].any? { |key| !data[key].empty? }
      'WARN'
    else
      'OK'
    end

  data['status'] = status
end

# ---

# Entry point: audits every IAID in the JSON file named by the first
# command-line argument and writes the results back to that same file.
#
# Usage: bin/audit_iaids.rb path/to/iaids.json
#
# The JSON document is read as an object keyed by IAID whose values are
# objects (possibly empty); each value is updated in place with the file
# listings, the missing_* differences, a status and a last_checked timestamp.
abort "Usage: #{$PROGRAM_NAME} path/to/iaids.json" if ARGV[0].nil? || ARGV[0].empty?

json_path = Pathname.new(ARGV[0])
raise "No file found at #{json_path}" unless json_path.exist?

hash = JSON.parse(File.read(json_path))
@bucket = SpaceStone::S3Service.bucket
# The debug log is appended to on every iteration; make sure its directory exists.
FileUtils.mkdir_p('tmp')
puts "\nTrack progress by tailing tmp/audit_debug.log\n\n"
progressbar = ProgressBar.create(total: hash.size, format: '%a %e %P% Processed: %c from %C')

begin
  hash.each do |iaid, data|
    data['last_checked'] = Time.now.strftime('%Y-%m-%dT%H:%M:%S')
    # Drop the status saved by a previous run. Otherwise a stale
    # "Couldn't connect to IA" marker makes log_status return early forever,
    # even after a later fetch succeeds.
    data.delete('status')

    # Only hit IA when no listing has been recorded yet (missing, null or empty).
    known_ia_files = data['ia_files']
    data['ia_files'] = fetch_filenames_from_ia(iaid, data) if known_ia_files.nil? || known_ia_files.empty?

    data['s3_files'] = fetch_filenames_from_s3("#{iaid}/downloads")
    data['s3_ocr'] = fetch_filenames_from_s3("#{iaid}/ocr")
    data['s3_thumbnails'] = fetch_filenames_from_s3("#{iaid}/thumbnails")

    # Compare by basename because the derivatives carry different extensions.
    expected = basenames(data['ia_files'])
    data['missing_files'] = expected - basenames(data['s3_files'])
    data['missing_ocr'] = expected - basenames(data['s3_ocr'])
    data['missing_thumbnails'] = expected - basenames(data['s3_thumbnails'])

    log_status(data)
    progressbar.increment
    # Reopened per entry so each line reaches disk immediately for `tail -f`.
    File.open('tmp/audit_debug.log', 'a') do |log|
      log.puts "#{iaid} -- #{data['status']}"
    end
  end
ensure
  # Runs on failure or interruption too, so partial progress is persisted.
  puts "\nBacking up existing data to #{json_path}.bak"
  FileUtils.cp(json_path, "#{json_path}.bak")

  File.write(json_path, "#{hash.to_json}\n")

  puts "\nOutput: #{json_path}"
end

0 comments on commit ce5ad7e

Please sign in to comment.