From 15c035c545f3040594a7022cecce2c924fa75eb3 Mon Sep 17 00:00:00 2001 From: LaRita Robinson Date: Mon, 5 Jun 2023 11:28:59 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=81=20Add=20derivative=5Frodeo=5Fsplit?= =?UTF-8?q?ter=20(#250)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🎁 Add derivative_rodeo_splitter Add a new PDF splitter option that wraps the DerivateRodeo's PdfSplitGenerator. It handles, in theory, PDF splitting and the derivative's generated in the DerivativeRodeo. Related to: - https://github.com/scientist-softserv/iiif_print/issues/220 Co-authored-by: Shana Moore Co-authored-by: Jeremy Friesen --- .../iiif_print/derivative_rodeo_service.rb | 112 ++++++++++++------ .../pluggable_derivative_service.rb | 2 +- docker-compose.yml | 4 +- iiif_print.gemspec | 2 +- lib/iiif_print.rb | 1 + lib/iiif_print/data/fileset_helper.rb | 2 +- .../jobs/child_works_from_pdf_job.rb | 19 ++- .../split_pdfs/derivative_rodeo_splitter.rb | 73 ++++++++++++ .../derivative_rodeo_splitter_spec.rb | 34 ++++++ .../derivative_rodeo_service_spec.rb | 14 +-- 10 files changed, 208 insertions(+), 55 deletions(-) create mode 100644 lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb create mode 100644 spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb index 7afe3d85..448037e9 100644 --- a/app/services/iiif_print/derivative_rodeo_service.rb +++ b/app/services/iiif_print/derivative_rodeo_service.rb @@ -25,43 +25,50 @@ class DerivativeRodeoService class_attribute :parent_work_identifier_property_name, default: 'aark_id' ## - # @attr input_location_adapter_name [String] The name of a derivative rodeo storage location; + # @attr preprocessed_location_adapter_name [String] The name of a derivative rodeo storage location; # this will must be a registered with the DerivativeRodeo::StorageLocations::BaseLocation. - class_attribute :input_location_adapter_name, default: 's3' + class_attribute :preprocessed_location_adapter_name, default: 's3' ## - # @attr named_derivatives_and_generators_by_type [Hash] the named derivative and it's - # associated generator. The "name" is important for Hyrax. The generator is one that - # exists in the DerivativeRodeo. + # @attr named_derivatives_and_generators_by_type [Hash] the named + # derivative and it's associated generator. The "name" is important for Hyrax or IIIF + # Print implementations. The generator is one that exists in the DerivativeRodeo. # # TODO: Could be nice to have a registry for the DerivativeRodeo::Generators; but that's a # tomorrow wish. class_attribute(:named_derivatives_and_generators_by_type, default: { - pdf: { thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator" } + pdf: { thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator" }, + image: { + thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator", + json: "DerivativeRodeo::Generator::WordCoordinatesGenerator", + xml: "DerivativeRodeo::Generator::AltoGenerator", + txt: "DerivativeRodeo::Generator::PlainTextGenerator" + } }) # @!endgroup Class Attributes ## ## - # This method "hard-codes" some existing assumptions about the input_uri based on - # implementations for Adventist. Those are reasonable assumptions but time will tell how - # reasonable. + # This method encodes some existing assumptions about the URI based on implementations for + # Adventist. Those are reasonable assumptions but time will tell how reasonable. + # + # By convention, this method is returning output_location of the SpaceStone::Serverless + # processing. We might know the original location that SpaceStone::Serverless processed, but + # that seems to be a tenuous assumption. + # + # In other words, where would SpaceStone, by convention, have written the original file and by + # convention written that original file's derivatives. + # + # TODO: We also need to account for PDF splitting # # @param file_set [FileSet] + # @param filename [String] + # @param extension [String] + # # @return [String] - def self.derivative_rodeo_input_uri(file_set:) - return @derivative_rodeo_input_uri if defined?(@derivative_rodeo_input_uri) - - # TODO: URGENT For a child work (e.g. an image split off of a PDF) we will know that the file_set's - # parent is a child, and the rules of the URI for those derivatives are different from the - # original ingested PDF or the original ingested Image. - - # TODO: This logic will work for an attached PDF; but not for each of the split pages of that - # PDF. How to do that? - - # TODO: This is duplicated logic for another service, consider extracting a helper module; - # better yet wouldn't it be nice if Hyrax did this right and proper. - parent = file_set.parent || file_set.member_of.find(&:work?) + # rubocop:disable Metrics/MethodLength + def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil) + parent = IiifPrint.parent_for(file_set) raise IiifPrint::DataError, "Parent not found for #{file_set.class} ID=#{file_set.id}" unless parent dirname = parent.public_send(parent_work_identifier_property_name) @@ -69,22 +76,42 @@ def self.derivative_rodeo_input_uri(file_set:) # TODO: This is a hack that knows about the inner workings of Hydra::Works, but for # expendiency, I'm using it. See # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/services/add_file_to_file_set.rb#L49-L53 - # TODO: Could we get away with filename that is passed in the create_derivatives process? - filename = Hydra::Works::DetermineOriginalName.call(file_set.original_file) + + # These filename, basename, extension name is here to allow for us to take an original file + # and see if we've pre-processed the derivative file. In the pre-processed derivative case, + # that would mean we have a different extension than the original. + filename ||= Hydra::Works::DetermineOriginalName.call(file_set.original_file) + extension ||= File.extname(filename) + extension = ".#{extension}" unless extension.start_with?(".") + basename = File.basename(filename, extension) # TODO: What kinds of exceptions might we raise if the location is not configured? Do we need # to "validate" it in another step. - location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(input_location_adapter_name) + location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(preprocessed_location_adapter_name) - # TODO: This is based on the provided output template in - # https://github.com/scientist-softserv/space_stone-serverless/blob/0dbe2b6fa13b9f4bf8b9580ec14d0af5c98e2b00/awslambda/bin/sample_post.rb#L1 - # and is very much hard-coded. We likely want to "store" the template in a common place for - # the application. + # Is the given filename one that is of the format of a file that was split off from a PDF. # - # s3://s3-antics/:aark_id/:file_name_with_extension - # s3://s3-antics/12345/hello-world.pdf - @derivative_rodeo_input_uri = File.join(location.adapter_prefix, dirname, File.basename(filename)) + # In this case the basename will include the "filename--page-\d.extension" type format where + # "\d" is a digit. + if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename) + # Note in this case the basename will have a suffix of "--page-" + # + # https://github.com/scientist-softserv/derivative_rodeo/blob/de8ab3993cc9d8719f70c6e990867ceb37d1dfd8/lib/derivative_rodeo/generators/pdf_split_generator.rb#L19-L56 + parent_pdf_file_basename = basename.sub(%r{--page-\d+$}) + page_basename = basename + File.join(location.adapter_prefix, dirname, parent_pdf_file_basename, "pages", "#{page_basename}#{extension}") + else + # TODO: This is based on the provided output template in + # https://github.com/scientist-softserv/space_stone-serverless/blob/0dbe2b6fa13b9f4bf8b9580ec14d0af5c98e2b00/awslambda/bin/sample_post.rb#L1 + # and is very much hard-coded. We likely want to "store" the template in a common place for + # the application. + # + # s3://s3-antics/:aark_id/:file_name_with_extension + # s3://s3-antics/12345/hello-world.pdf + File.join(location.adapter_prefix, dirname, "#{basename}#{extension}") + end end + # rubocop:enable Metrics/MethodLength def initialize(file_set) @file_set = file_set @@ -125,14 +152,25 @@ def create_derivatives(filename) private - def lasso_up_some_derivatives(type:, **) + def lasso_up_some_derivatives(type:, filename:) # TODO: Can we use the filename instead of the antics of the original_file on the file_set? # We have the filename in create_derivatives. named_derivatives_and_generators_by_type.fetch(type).flat_map do |named_derivative, generator_name| # This is the location that Hyrax expects us to put files that will be added to Fedora. output_location_template = "file://#{Hyrax::DerivativePath.derivative_path_for_reference(file_set, named_derivative)}" + + # The generator knows the output extensions. generator = generator_name.constantize - generator.new(input_uris: [derivative_rodeo_input_uri], output_location_template: output_location_template).generate_uris + + # This is the location where we hope the derivative rodeo will have generated the derived + # file (e.g. a PDF page's txt file or an image's thumbnail. + pre_processed_location_template = self.class.derivative_rodeo_uri(file_set: file_set, filename: filename, extension: generator.output_extension) + + generator.new( + input_uris: [input_uri], + pre_processed_location_template: pre_processed_location_template, + output_location_template: output_location_template + ).generated_files end end @@ -141,8 +179,8 @@ def supported_mime_types named_derivatives_and_generators_by_type.keys.flat_map { |type| file_set.class.public_send("#{type}_mime_types") } end - def derivative_rodeo_input_uri - @derivative_rodeo_input_uri ||= self.class.derivative_rodeo_input_uri(file_set: file_set) + def input_uri + @input_uri ||= self.class.derivative_rodeo_uri(file_set: file_set) end def in_the_rodeo? @@ -150,7 +188,7 @@ def in_the_rodeo? # for looking in the rodeo. return false unless supported_mime_types.include?(mime_type) - location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(derivative_rodeo_input_uri) + location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(input_uri) location.exist? end end diff --git a/app/services/iiif_print/pluggable_derivative_service.rb b/app/services/iiif_print/pluggable_derivative_service.rb index fce11316..a909f677 100644 --- a/app/services/iiif_print/pluggable_derivative_service.rb +++ b/app/services/iiif_print/pluggable_derivative_service.rb @@ -39,7 +39,7 @@ def initialize(file_set, plugins: plugins_for(file_set)) # multiple plugins, some of which may or may not be valid, so # validity checks happen within as well. def valid? - !valid_plugins.size.zero? + !valid_plugins.empty? end # get derivative services relevant to method name and file_set context diff --git a/docker-compose.yml b/docker-compose.yml index 478c11a9..d2408785 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -85,12 +85,12 @@ services: environment: - VIRTUAL_PORT=3000 - VIRTUAL_HOST=.hyku.test - command: tail -f /dev/null + # command: tail -f /dev/null ## ## Similar to the above, except we will bundle and then tell the container ## to wait. You'll then need to bash into the web container to do much of ## anything. - # command: sh -l -c "bundle && echo \"Finished bundling now waiting...\" && tail -f /dev/null" + command: sh -l -c "bundle install && echo \"Finished bundling now waiting...\" && tail -f /dev/null" depends_on: db: condition: service_started diff --git a/iiif_print.gemspec b/iiif_print.gemspec index 3e31fb4f..7b142a14 100644 --- a/iiif_print.gemspec +++ b/iiif_print.gemspec @@ -23,7 +23,7 @@ SUMMARY spec.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR) spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) spec.add_dependency 'blacklight_iiif_search', '~> 1.0' - spec.add_dependency 'derivative-rodeo' + spec.add_dependency 'derivative-rodeo', "~> 0.3" spec.add_dependency 'dry-monads', '~> 1.4.0' spec.add_dependency 'hyrax', '>= 2.5', '< 4' spec.add_dependency 'nokogiri', '>=1.13.2' diff --git a/lib/iiif_print.rb b/lib/iiif_print.rb index 38e6f9d6..a9d164ea 100644 --- a/lib/iiif_print.rb +++ b/lib/iiif_print.rb @@ -19,6 +19,7 @@ require "iiif_print/jobs/child_works_from_pdf_job" require "iiif_print/split_pdfs/base_splitter" require "iiif_print/split_pdfs/child_work_creation_from_pdf_service" +require "iiif_print/split_pdfs/derivative_rodeo_splitter" module IiifPrint extend ActiveSupport::Autoload diff --git a/lib/iiif_print/data/fileset_helper.rb b/lib/iiif_print/data/fileset_helper.rb index d7d013aa..dc8c603e 100644 --- a/lib/iiif_print/data/fileset_helper.rb +++ b/lib/iiif_print/data/fileset_helper.rb @@ -7,7 +7,7 @@ def fileset_id # if context is itself a string, presume it is a file set id return @work if @work.is_a? String # if context is not a String, presume a work or fileset context: - fileset.nil? ? nil : fileset.id + fileset&.id end def first_fileset diff --git a/lib/iiif_print/jobs/child_works_from_pdf_job.rb b/lib/iiif_print/jobs/child_works_from_pdf_job.rb index fef883ce..e61b17b2 100644 --- a/lib/iiif_print/jobs/child_works_from_pdf_job.rb +++ b/lib/iiif_print/jobs/child_works_from_pdf_job.rb @@ -15,17 +15,19 @@ def perform(candidate_for_parency, pdf_paths, user, admin_set_id, *) # We know that we have cases where parent_work is nil, this will definitely raise an # exception; which is fine because we were going to do it later anyway. @parent_work = if candidate_for_parency.work? + pdf_file_set = nil candidate_for_parency else # We likely have a file set + pdf_file_set = candidate_for_parency IiifPrint.parent_for(candidate_for_parency) end @child_admin_set_id = admin_set_id child_model = @parent_work.iiif_print_config.pdf_split_child_model - # handle each input pdf + # handle each input pdf (when input is a file set, we will only have one). pdf_paths.each do |original_pdf_path| - split_pdf(original_pdf_path, user, child_model) + split_pdf(original_pdf_path, user, child_model, pdf_file_set) end # Link newly created child works to the parent @@ -47,10 +49,8 @@ def perform(candidate_for_parency, pdf_paths, user, admin_set_id, *) private # rubocop:disable Metrics/ParameterLists - def split_pdf(original_pdf_path, user, child_model) - # TODO: This is the place to change out the existing service and instead use the derivative - # rodeo; we will likely need to look at method signatures to tighten this interface. - image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path) + def split_pdf(original_pdf_path, user, child_model, pdf_file_set) + image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path, file_set: pdf_file_set) return if image_files.blank? prepare_import_data(original_pdf_path, image_files, user) @@ -97,6 +97,13 @@ def prepare_import_data(original_pdf_path, image_files, user) PendingRelationship.create!(child_title: child_title, parent_id: @parent_work.id, child_order: child_title) + + begin + # Clean up the temporary image path. + File.rm_f(image_path) if File.exist?(image_path) + rescue + # If we can't delete, let's move on. Maybe it was already cleaned-up. + end end end # rubocop:enable Metrics/MethodLength diff --git a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb new file mode 100644 index 00000000..b7c2817a --- /dev/null +++ b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb @@ -0,0 +1,73 @@ +module IiifPrint + module SplitPdfs + ## + # This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed + # images, or split a PDF if there are no preprocessed images. + # + # We have already attached the original file to the file_set. We want to convert that original + # file that's attached to a input_uri (e.g. "file://path/to/original-file" as in what we have + # written to Fedora as the PDF) + # + # @see .call + class DerivativeRodeoSplitter + ## + # @param filename [String] the local path to the PDFDerivativeServicele + # @param file_set [FileSet] file set containing the PDF file to split + # + # @return [Array] paths to images split from each page of PDF file + def self.call(filename, file_set:) + new(filename, file_set: file_set).split_files + end + + def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir) + @input_uri = "file://#{filename}" + + # We are writing the images to a local location that CarrierWave can upload. This is a + # local file, internal to IiifPrint; it looks like SpaceStone/DerivativeRodeo lingo, but + # that's just a convenience. + output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}') + + @output_location_template = "file://#{output_template_path}" + @preprocessed_location_template = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename) + end + + ## + # This is where, in "Fedora" we have the original file. This is not the original file in the + # pre-processing location but instead the long-term location of the file in the application + # that mounts IIIF Print. + # + # @return [String] + attr_reader :input_uri + + ## + # This is the location where we're going to write the derivatives that will "go into Fedora"; + # it is a local location, one that IIIF Print's mounting application can directly do + # "File.read" + # + # @return [String] + attr_reader :output_location_template + + ## + # Where can we find, in the DerivativeRodeo's storage, what has already been done regarding + # derivative generation. + # + # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3 + # bucket that we then use for IIIF Print. + # + # @return [String] + # + # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63 + attr_reader :preprocessed_location_template + + ## + # @return [Array] the paths to each of the images split off from the PDF. + def split_files + DerivativeRodeo::Generators::PdfSplitGenerator.new( + input_uris: [@input_uri], + output_location_template: output_location_template, + preprocessed_location_template: preprocessed_location_template + ).generated_files.map(&:file_path) + end + end + end +end diff --git a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb new file mode 100644 index 00000000..46333f37 --- /dev/null +++ b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe IiifPrint::SplitPdfs::DerivativeRodeoSplitter do + let(:path) { nil } + let(:work) { double(MyWork, aark_id: '12345') } + let(:file_set) { FileSet.new.tap { |fs| fs.save!(validate: false) } } + + describe 'class' do + subject { described_class } + + it { is_expected.to respond_to(:call) } + end + + describe "instance" do + subject { described_class.new(path, file_set: file_set) } + let(:generator) { double(DerivativeRodeo::Generators::PdfSplitGenerator, generated_files: []) } + + before do + allow(file_set).to receive(:parent).and_return(work) + # TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but + # this part is only one piece of the over all integration. + allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__)) + end + + it { is_expected.to respond_to :split_files } + + it 'uses the rodeo to split' do + expect(DerivativeRodeo::Generators::PdfSplitGenerator).to receive(:new).and_return(generator) + described_class.call(path, file_set: file_set) + end + end +end diff --git a/spec/services/iiif_print/derivative_rodeo_service_spec.rb b/spec/services/iiif_print/derivative_rodeo_service_spec.rb index 303127e3..d5acf79a 100644 --- a/spec/services/iiif_print/derivative_rodeo_service_spec.rb +++ b/spec/services/iiif_print/derivative_rodeo_service_spec.rb @@ -12,8 +12,8 @@ subject(:klass) { described_class } - describe '.input_location_adapter_name' do - subject { described_class.input_location_adapter_name } + describe '.preprocessed_location_adapter_name' do + subject { described_class.preprocessed_location_adapter_name } it { is_expected.to eq 's3' } end @@ -27,8 +27,8 @@ it { is_expected.to be_a Hash } end - describe '.derivative_rodeo_input_uri' do - subject { described_class.derivative_rodeo_input_uri(file_set: file_set) } + describe '.derivative_rodeo_uri' do + subject { described_class.derivative_rodeo_uri(file_set: file_set) } context 'when the file_set does not have a parent' do xit 'is expected to raise an error' do @@ -39,12 +39,12 @@ context 'when the file_set has a parent' do before do allow(file_set).to receive(:parent).and_return(work) - # TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but + # TODO: This is a hack that leverages the internals oof Hydra::Works; not excited about it but # this part is only one piece of the over all integration. allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__)) end - it { is_expected.to start_with("#{described_class.input_location_adapter_name}://") } + it { is_expected.to start_with("#{described_class.preprocessed_location_adapter_name}://") } it { is_expected.to end_with(File.basename(__FILE__)) } end end @@ -64,7 +64,7 @@ end context 'when derivative rodeo has not pre-processed the file set' do - before { instance.input_location_adapter_name = "file" } + before { instance.preprocessed_location_adapter_name = "file" } let(:mime_type) { "application/pdf" } it { is_expected.to be_falsey }