From 6ce7d884417a0f6ab12a0e854aef88098061244a Mon Sep 17 00:00:00 2001 From: Jeremy Friesen Date: Wed, 24 May 2023 14:52:04 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=81=20Adding=20IiifPrint::DerivativeRo?= =?UTF-8?q?deoService?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding a class that implements that Hyrax::DerivativeService and makes the necessary assumptions about how we are creating derivatives using the rodeo. There are lots of comments and todos that are a short-cut. Related to: - https://github.com/scientist-softserv/iiif_print/issues/219 - https://github.com/scientist-softserv/iiif_print/pull/243 --- .../iiif_print/derivative_rodeo_service.rb | 157 ++++++++++++++++++ lib/iiif_print/engine.rb | 1 + .../jobs/child_works_from_pdf_job.rb | 2 + .../derivative_rodeo_service_spec.rb | 85 ++++++++++ 4 files changed, 245 insertions(+) create mode 100644 app/services/iiif_print/derivative_rodeo_service.rb create mode 100644 spec/services/iiif_print/derivative_rodeo_service_spec.rb diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb new file mode 100644 index 00000000..7afe3d85 --- /dev/null +++ b/app/services/iiif_print/derivative_rodeo_service.rb @@ -0,0 +1,157 @@ +module IiifPrint + ## + # This class implements the interface of a Hyrax::DerivativeService. + # + # That means three important methods are: + # + # - {#valid?} + # - {#create_derivatives} + # - {#cleanup_derivatives} + # + # And the object initializes with a FileSet. + # + # It is a companion to {IiifPrint::PluggableDerivativeService}. + # + # @see https://github.com/samvera/hyrax/blob/main/app/services/hyrax/derivative_service.rb Hyrax::DerivativesService + class DerivativeRodeoService + ## + # @!group Class Attributes + # + # @attr parent_work_identifier_property_name [String] the property we use to identify the unique + # identifier of the parent work as it went through the SpaceStone pre-process. + # + # TODO: The default of :aark_id is a quick hack for adventist. By exposing a configuration + # value, my hope is that this becomes easier to configure. + class_attribute :parent_work_identifier_property_name, default: 'aark_id' + + ## + # @attr input_location_adapter_name [String] The name of a derivative rodeo storage location; + # this will must be a registered with the DerivativeRodeo::StorageLocations::BaseLocation. + class_attribute :input_location_adapter_name, default: 's3' + + ## + # @attr named_derivatives_and_generators_by_type [Hash] the named derivative and it's + # associated generator. The "name" is important for Hyrax. The generator is one that + # exists in the DerivativeRodeo. + # + # TODO: Could be nice to have a registry for the DerivativeRodeo::Generators; but that's a + # tomorrow wish. + class_attribute(:named_derivatives_and_generators_by_type, default: { + pdf: { thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator" } + }) + # @!endgroup Class Attributes + ## + + ## + # This method "hard-codes" some existing assumptions about the input_uri based on + # implementations for Adventist. Those are reasonable assumptions but time will tell how + # reasonable. + # + # @param file_set [FileSet] + # @return [String] + def self.derivative_rodeo_input_uri(file_set:) + return @derivative_rodeo_input_uri if defined?(@derivative_rodeo_input_uri) + + # TODO: URGENT For a child work (e.g. an image split off of a PDF) we will know that the file_set's + # parent is a child, and the rules of the URI for those derivatives are different from the + # original ingested PDF or the original ingested Image. + + # TODO: This logic will work for an attached PDF; but not for each of the split pages of that + # PDF. How to do that? + + # TODO: This is duplicated logic for another service, consider extracting a helper module; + # better yet wouldn't it be nice if Hyrax did this right and proper. + parent = file_set.parent || file_set.member_of.find(&:work?) + raise IiifPrint::DataError, "Parent not found for #{file_set.class} ID=#{file_set.id}" unless parent + + dirname = parent.public_send(parent_work_identifier_property_name) + + # TODO: This is a hack that knows about the inner workings of Hydra::Works, but for + # expendiency, I'm using it. See + # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/services/add_file_to_file_set.rb#L49-L53 + # TODO: Could we get away with filename that is passed in the create_derivatives process? + filename = Hydra::Works::DetermineOriginalName.call(file_set.original_file) + + # TODO: What kinds of exceptions might we raise if the location is not configured? Do we need + # to "validate" it in another step. + location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(input_location_adapter_name) + + # TODO: This is based on the provided output template in + # https://github.com/scientist-softserv/space_stone-serverless/blob/0dbe2b6fa13b9f4bf8b9580ec14d0af5c98e2b00/awslambda/bin/sample_post.rb#L1 + # and is very much hard-coded. We likely want to "store" the template in a common place for + # the application. + # + # s3://s3-antics/:aark_id/:file_name_with_extension + # s3://s3-antics/12345/hello-world.pdf + @derivative_rodeo_input_uri = File.join(location.adapter_prefix, dirname, File.basename(filename)) + end + + def initialize(file_set) + @file_set = file_set + end + + attr_reader :file_set + delegate :uri, :mime_type, to: :file_set + + ## + # @return + # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb#L18-L20 Hyrax::FileSetDerivativesService#valid? + def valid? + if in_the_rodeo? + Rails.logger.info("🤠🐮 Using the DerivativeRodeo for FileSet ID=#{file_set.id} with mime_type of #{mime_type}") + true + else + Rails.logger.info("Skipping the DerivativeRodeo for FileSet ID=#{file_set.id} with mime_type of #{mime_type}") + false + end + end + + ## + # @api public + # + # The file_set.class.*_mime_types are carried over from Hyrax. + def create_derivatives(filename) + # TODO: Do we need to handle "impending derivatives?" as per {IiifPrint::PluggableDerivativeService}? + case mime_type + when file_set.class.pdf_mime_types + lasso_up_some_derivatives(filename: filename, type: :pdf) + when file_set.class.image_mime_types + lasso_up_some_derivatives(filename: filename, type: :image) + else + # TODO: add more mime types but for now image and PDF are the two we accept. + raise "Unexpected mime_type #{mime_type} for filename #{filename}" + end + end + + private + + def lasso_up_some_derivatives(type:, **) + # TODO: Can we use the filename instead of the antics of the original_file on the file_set? + # We have the filename in create_derivatives. + named_derivatives_and_generators_by_type.fetch(type).flat_map do |named_derivative, generator_name| + # This is the location that Hyrax expects us to put files that will be added to Fedora. + output_location_template = "file://#{Hyrax::DerivativePath.derivative_path_for_reference(file_set, named_derivative)}" + generator = generator_name.constantize + generator.new(input_uris: [derivative_rodeo_input_uri], output_location_template: output_location_template).generate_uris + end + end + + def supported_mime_types + # If we've configured the rodeo + named_derivatives_and_generators_by_type.keys.flat_map { |type| file_set.class.public_send("#{type}_mime_types") } + end + + def derivative_rodeo_input_uri + @derivative_rodeo_input_uri ||= self.class.derivative_rodeo_input_uri(file_set: file_set) + end + + def in_the_rodeo? + # We can assume that we are not going to process a supported mime type; and there is a cost + # for looking in the rodeo. + return false unless supported_mime_types.include?(mime_type) + + location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(derivative_rodeo_input_uri) + location.exist? + end + end +end diff --git a/lib/iiif_print/engine.rb b/lib/iiif_print/engine.rb index b8273cf2..1330f20a 100644 --- a/lib/iiif_print/engine.rb +++ b/lib/iiif_print/engine.rb @@ -1,6 +1,7 @@ require 'active_fedora' require 'hyrax' require 'blacklight_iiif_search' +require 'derivative_rodeo' module IiifPrint # module constants: diff --git a/lib/iiif_print/jobs/child_works_from_pdf_job.rb b/lib/iiif_print/jobs/child_works_from_pdf_job.rb index d3e9825a..fef883ce 100644 --- a/lib/iiif_print/jobs/child_works_from_pdf_job.rb +++ b/lib/iiif_print/jobs/child_works_from_pdf_job.rb @@ -48,6 +48,8 @@ def perform(candidate_for_parency, pdf_paths, user, admin_set_id, *) # rubocop:disable Metrics/ParameterLists def split_pdf(original_pdf_path, user, child_model) + # TODO: This is the place to change out the existing service and instead use the derivative + # rodeo; we will likely need to look at method signatures to tighten this interface. image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path) return if image_files.blank? diff --git a/spec/services/iiif_print/derivative_rodeo_service_spec.rb b/spec/services/iiif_print/derivative_rodeo_service_spec.rb new file mode 100644 index 00000000..303127e3 --- /dev/null +++ b/spec/services/iiif_print/derivative_rodeo_service_spec.rb @@ -0,0 +1,85 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe IiifPrint::DerivativeRodeoService do + let(:work) { double({ described_class.parent_work_identifier_property_name => 'hello-1234-id' }) } + let(:file_set) { FileSet.new.tap { |fs| fs.save!(validate: false) } } + let(:generator) { DerivativeRodeo::Generators::CopyGenerator } + let(:output_extension) { "rb" } + + let(:instance) { described_class.new(file_set) } + + subject(:klass) { described_class } + + describe '.input_location_adapter_name' do + subject { described_class.input_location_adapter_name } + it { is_expected.to eq 's3' } + end + + describe '.parent_work_identifier_property_name' do + subject { described_class.parent_work_identifier_property_name } + it { is_expected.to be_a String } + end + + describe '.named_derivatives_and_generators_by_type' do + subject { described_class.named_derivatives_and_generators_by_type } + it { is_expected.to be_a Hash } + end + + describe '.derivative_rodeo_input_uri' do + subject { described_class.derivative_rodeo_input_uri(file_set: file_set) } + + context 'when the file_set does not have a parent' do + xit 'is expected to raise an error' do + expect { subject }.to raise_error(IiifPrint::DataError) + end + end + + context 'when the file_set has a parent' do + before do + allow(file_set).to receive(:parent).and_return(work) + # TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but + # this part is only one piece of the over all integration. + allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__)) + end + + it { is_expected.to start_with("#{described_class.input_location_adapter_name}://") } + it { is_expected.to end_with(File.basename(__FILE__)) } + end + end + + # TODO: Need Faux Bucket for Derivative Rodeo + xdescribe '#valid?' do + subject { instance.valid? } + + before do + allow(file_set).to receive(:mime_type).and_return(mime_type) + allow(file_set).to receive(:parent).and_return(work) + end + + context 'when the mime_type of the file is not supported' do + let(:mime_type) { "text/plain" } + it { is_expected.to be_falsey } + end + + context 'when derivative rodeo has not pre-processed the file set' do + before { instance.input_location_adapter_name = "file" } + + let(:mime_type) { "application/pdf" } + it { is_expected.to be_falsey } + end + + context 'when the mime type is supported and the derivative rodeo has pre-processed the file set' do + before do + # TODO: write to the rodeo; consider using AWS's spec support; I want to be able to "fake" S3 + # with a "fake" bucket. + # + # Dependent on https://github.com/scientist-softserv/derivative_rodeo/pull/37 + end + + let(:mime_type) { "application/pdf" } + it { is_expected.to be_truthy } + end + end +end