From d6d8de7f49b5fabd9c73e57b4766fc98a37a1d56 Mon Sep 17 00:00:00 2001 From: Jeremy Friesen Date: Fri, 2 Jun 2023 11:25:38 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=81=20Adding=20the=20PDF=20Splitting?= =?UTF-8?q?=20Logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds the logic to take an uploaded PDF and then split that PDF into constituent images. It does not yet account for how we handle the derivatives we generate from the images split off from the PDF. Related to: - https://github.com/scientist-softserv/iiif_print/issues/220 Co-authored-by: LaRita Robinson Co-authored-by: Shana Moore --- .../iiif_print/derivative_rodeo_service.rb | 13 ++- .../jobs/child_works_from_pdf_job.rb | 7 ++ .../split_pdfs/derivative_rodeo_splitter.rb | 79 +++++++++++++------ 3 files changed, 72 insertions(+), 27 deletions(-) diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb index 7afe3d85..c177a0d9 100644 --- a/app/services/iiif_print/derivative_rodeo_service.rb +++ b/app/services/iiif_print/derivative_rodeo_service.rb @@ -47,9 +47,18 @@ class DerivativeRodeoService # implementations for Adventist. Those are reasonable assumptions but time will tell how # reasonable. # + # By convention, this method is returning output_location of the SpaceStone::Serverless + # processing. We might know the original location that SpaceStone::Serverless processed, but + # that seems to be a tenuous assumption. + # + # In other words, where would SpaceStone, by convention, have written the original file and by + # convention written that original file's derivatives. 
+ # + # TODO: We also need to account for PDF splitting + # # @param file_set [FileSet] # @return [String] - def self.derivative_rodeo_input_uri(file_set:) + def self.derivative_rodeo_input_uri(file_set:, filename: nil) return @derivative_rodeo_input_uri if defined?(@derivative_rodeo_input_uri) # TODO: URGENT For a child work (e.g. an image split off of a PDF) we will know that the file_set's @@ -70,7 +79,7 @@ def self.derivative_rodeo_input_uri(file_set:) # expendiency, I'm using it. See # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/services/add_file_to_file_set.rb#L49-L53 # TODO: Could we get away with filename that is passed in the create_derivatives process? - filename = Hydra::Works::DetermineOriginalName.call(file_set.original_file) + filename ||= Hydra::Works::DetermineOriginalName.call(file_set.original_file) # TODO: What kinds of exceptions might we raise if the location is not configured? Do we need # to "validate" it in another step. diff --git a/lib/iiif_print/jobs/child_works_from_pdf_job.rb b/lib/iiif_print/jobs/child_works_from_pdf_job.rb index 5eea0ad1..e61b17b2 100644 --- a/lib/iiif_print/jobs/child_works_from_pdf_job.rb +++ b/lib/iiif_print/jobs/child_works_from_pdf_job.rb @@ -97,6 +97,13 @@ def prepare_import_data(original_pdf_path, image_files, user) PendingRelationship.create!(child_title: child_title, parent_id: @parent_work.id, child_order: child_title) + + begin + # Clean up the temporary image path. + File.rm_f(image_path) if File.exist?(image_path) + rescue + # If we can't delete, let's move on. Maybe it was already cleaned-up. 
+ end end end # rubocop:enable Metrics/MethodLength diff --git a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb index 9745cbff..3ba7d5df 100644 --- a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +++ b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb @@ -1,41 +1,70 @@ module IiifPrint module SplitPdfs + ## + # This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed + # images, or split a PDF if there are no preprocessed images. + # + # We have already attached the original file to the file_set. We want to convert that original + # file that's attached to an input_uri (e.g. "file://path/to/original-file" as in what we have + # written to Fedora as the PDF) + # + # @see .call class DerivativeRodeoSplitter ## - # This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed - # images, or split a PDF if there are no preprocessed images. - # - # TODO: override output extension from default "tiff" in Derivative Rodeo - # TODO: define output_location_template & preprocessed_location_template - - ## - # @param _path [String] unused here, kept for consistant splitter method signature + # @param path [String] the local file location # @param file_set [FileSet] file set containing a PDF file to split + # # @return [Array] paths to images split from each page of PDF file - def self.call(_path, file_set:) - new(file_set: file_set).split_files + def self.call(path, file_set:) + new(path, file_set: file_set).split_files end - def initialize(file_set:) - @path = IiifPrint::DerivativeRodeoService.derivative_rodeo_input_uri(file_set: file_set) - end + def initialize(path, file_set:, output_tmp_dir: Dir.tmpdir) + @input_uri = "file://#{path}" - def split_files - DerivativeRodeo::Generators::PdfSplitGenerator.new( - input_uris: @path, - output_location_template: template, - preprocessed_location_template: location - ).generated_files + # We are 
writing the images to a location that CarrierWave can upload. + # + # https://github.com/scientist-softserv/iiif_print/blob/b969541de1a0526305b54de37bf7cf100289f088/lib/iiif_print/jobs/child_works_from_pdf_job.rb#L108 + output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}') + + @output_location_template = "file://#{output_template_path}" + @preprocessed_location_template = IiifPrint::DerivativeRodeoService.derivative_rodeo_input_uri(file_set: file_set, filename: filename) end - private + ## + # This is where, in "Fedora" we have the original file. This is not the original file in the + # pre-processing location but instead the long-term location of the file in the application + # that mounts IIIF Print. + # + # @return [String] + attr_reader :input_uri - def template - 'who knows' - end + ## + # This is the location where we're going to write the derivatives that will "go into Fedora". + # + # @return [String] + attr_reader :output_location_template + + ## + # Where can we find, in the DerivativeRodeo's storage, what has already been done regarding + # derivative generation. + # + # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3 + # bucket that we then use for IIIF Print. + # + # @return [String] + # + # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63 + attr_reader :preprocessed_location_template - def location - 'who knows' + ## + # @return [Array] the paths to each of the images split off from the PDF. + def split_files + DerivativeRodeo::Generators::PdfSplitGenerator.new( + input_uris: [@input_uri], + output_location_template: output_location_template, + preprocessed_location_template: preprocessed_location_template + ).generated_files.map { |location| location.file_path } end end end