Skip to content

Commit

Permalink
☑️ Verifying pdf splitter finds pre-existing files
Browse files Browse the repository at this point in the history
Updating a bit of documentation and reworking the filename to account
for a work having multiple PDFs.

- https://github.com/scientist-softserv/adventist-dl/issues/330
- scientist-softserv/iiif_print#220
  • Loading branch information
jeremyf committed May 30, 2023
1 parent c62729f commit a45e57f
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 9 deletions.
17 changes: 11 additions & 6 deletions lib/derivative_rodeo/generators/pdf_split_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ class PdfSplitGenerator < BaseGenerator
#
# @see #existing_page_locations
def image_file_basename_template(basename:)
  # Nest the page images beneath a directory named after the PDF's basename so that a work
  # having multiple PDFs keeps each PDF's pages in a separate sub-directory.
  #
  # The "%d" is intentionally left un-interpolated; presumably the consumer of this template
  # fills it in with the page number.
  "#{basename}/pages/#{basename}-%d.#{output_extension}"
end

##
Expand All @@ -45,10 +43,12 @@ def image_file_basename_template(basename:)
# with :tail_glob.
#
# @note This method is related to {Generators::BaseGenerator#destination}.
#
# @note The tail_glob must mirror the directory layout produced by {#image_file_basename_template}.
def existing_page_locations(input_location:)
# TODO: Are we adequately accounting for the directory structure necessary to have a work have
# more than one PDF and then split each PDF's pages into the correct sub directory?
tail_glob = "pages/*.#{output_extension}"
# See image_file_basename_template
tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"

output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
return output_locations if output_locations.count.positive?

Expand All @@ -69,7 +69,12 @@ def existing_page_locations(input_location:)
# @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
# @yieldparam image_path [String] where to find this file in the tmp space
#
# @note This function makes a concession; namely that if it encounters any
# {#existing_page_locations} it will use all of that result as the entire number of pages.
# We could make this smarter but at the moment we're deferring on that.
#
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
#
# rubocop:disable Metrics/MethodLength
def with_each_requisite_location_and_tmp_file_path
input_files.each do |input_location|
Expand Down
27 changes: 27 additions & 0 deletions spec/derivative_rodeo/generators/pdf_split_generator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,33 @@
end

describe '#generated_files' do
context 'when given an already split PDF' do
  it 'uses the already split components' do
    Fixtures.with_file_uris_for("minimal-2-page.pdf") do |input_uris|
      Fixtures.with_temporary_directory do |output_temporary_path|
        template = "file://#{output_temporary_path}/{{ dir_parts[-1..-1] }}/{{ filename }}"
        generator = described_class.new(input_uris: input_uris, output_location_template: template)
        derived_location = DerivativeRodeo::StorageLocations::FileLocation.build(from_uri: input_uris.first, template: template)

        # Plant a stand-in TIFF where a previously split page would be found.
        planted_tiff_path = File.join(derived_location.file_dir, derived_location.file_basename, "pages/1.tiff")
        FileUtils.mkdir_p(File.dirname(planted_tiff_path))
        File.open(planted_tiff_path, "w+") { |file| file.puts "🤠🐮🐴 A muppet man parading as a TIFF." }

        found_files = generator.generated_files

        # TODO: The source PDF has two pages, but the generator only checks that at least one
        # derived file exists; so the single planted file is the whole result.
        expect(found_files.size).to eq(1)

        # The one file found must be the planted file, not a freshly generated page.
        expect(File.read(found_files.first.file_path)).to start_with("🤠🐮🐴")
      end
    end
  end
end

context 'when given a PDF to split' do
it 'will create one image per page, writing that to the storage adapter, and then enqueue each page for processing' do
generated_files = nil
Expand Down
4 changes: 1 addition & 3 deletions spec/derivative_rodeo/storage_locations/s3_location_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,9 @@

describe '#globbed_tail_locations' do
# Uploads this spec file to the bucket and then globs the bucket for "*.rb".
#
# NOTE(review): no expectation is made on the result of #globbed_tail_locations; as written
# this example only demonstrates that the call completes without raising.
it 'searched the bucket' do
# The subject's bucket is not the same as the above bucket
subject.bucket = bucket
# `short_path` and `bucket` are presumably defined outside this example — confirm.
# The key is "<short_path up to its first '.'>/<this spec's filename>".
basename_ish = short_path.split(".").first
key = File.join(basename_ish, File.basename(__FILE__))
# NOTE(review): the same key is uploaded through both `bucket` and `subject.bucket`; given the
# assignment above these presumably reference the same bucket, making one upload redundant —
# confirm.
bucket.object(key).upload_file(__FILE__)
subject.bucket.object(key).upload_file(__FILE__)

subject.globbed_tail_locations(tail_glob: "*.rb")
end
Expand Down

0 comments on commit a45e57f

Please sign in to comment.