diff --git a/lib/iiif_print/errors.rb b/lib/iiif_print/errors.rb
index 70dec2e3..28727031 100644
--- a/lib/iiif_print/errors.rb
+++ b/lib/iiif_print/errors.rb
@@ -6,4 +6,8 @@ class IiifPrintError < StandardError
   # Data transformation or read-error:
   class DataError < IiifPrintError
   end
+
+  # A file that was expected to exist at a given location could not be found:
+  class MissingFileError < IiifPrintError
+  end
 end
diff --git a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
index d5375c9d..e019873d 100644
--- a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
+++ b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
@@ -20,6 +20,9 @@ def self.call(filename, file_set:)
       end
 
       def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
+        @filename = filename
+        @file_set = file_set
+
         @input_uri = "file://#{filename}"
 
         # We are writing the images to a local location that CarrierWave can upload. This is a
@@ -28,9 +31,10 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
         output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}')
 
         @output_location_template = "file://#{output_template_path}"
-        @preprocessed_location_template = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)
       end
 
+      attr_reader :filename, :file_set
+
       ##
       # This is where, in "Fedora" we have the original file. This is not the original file in the
       # pre-processing location but instead the long-term location of the file in the application
@@ -48,8 +52,11 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
       attr_reader :output_location_template
 
       ##
-      # Where can we find, in the DerivativeRodeo's storage, what has already been done regarding
-      # derivative generation.
+      # Where can we find the file that represents the pre-processing template. In this case, the
+      # original PDF file.
+      #
+      # The logic handles a case where SpaceStone successfully fetched the file to then perform
+      # processing.
       #
       # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3
       # bucket that we then use for IIIF Print.
@@ -61,19 +68,68 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
-      # @return [String]
+      # @return [String] the rodeo URI when that file exists, else the file set's import_url
+      # @return [nil] when the rodeo file does not exist and the file set has no import_url
+      # @raise [IiifPrint::MissingFileError] when the import_url is set but does not exist
       #
       # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
-      attr_reader :preprocessed_location_template
+      # rubocop:disable Metrics/AbcSize
+      # rubocop:disable Metrics/MethodLength
+      def preprocessed_location_template
+        return @preprocessed_location_template if defined?(@preprocessed_location_template)
+
+        derivative_rodeo_candidate = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)
+
+        @preprocessed_location_template =
+          if rodeo_conformant_uri_exists?(derivative_rodeo_candidate)
+            Rails.logger.debug("#{self.class}##{__method__} found existing file at location #{derivative_rodeo_candidate}. High five partner!")
+            derivative_rodeo_candidate
+          elsif file_set.import_url
+            message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist. " \
+                      "Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}"
+            Rails.logger.warn(message)
+            # If the DerivativeRodeo doesn't know about the adapter for the import, this will raise
+            # an error.
+            #
+            # Since the file was not pre-processed, we're likely now going to be downloading that
+            # file and running all of the derivatives locally.
+            if rodeo_conformant_uri_exists?(file_set.import_url)
+              message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist. " \
+                        "Perhaps there was a problem in SpaceStone downloading the file? Things should be okay."
+              Rails.logger.info(message)
+              file_set.import_url
+            else
+              message = "#{self.class}##{__method__} expected #{file_set.import_url.inspect} as specified " \
+                        "by #{file_set.class}#import_url to exist at remote location, but it did not."
+              raise MissingFileError, message
+            end
+          else
+            message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \
+                      "nor an import_url for #{file_set.class} ID=#{file_set.id}. Returning `nil' as we have no possible preprocess. " \
+                      "Maybe the input_uri #{input_uri.inspect} will be adequate."
+            Rails.logger.warn(message)
+            nil
+          end
+      end
+      # rubocop:enable Metrics/AbcSize
+      # rubocop:enable Metrics/MethodLength
+
+      def rodeo_conformant_uri_exists?(uri)
+        DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist?
+      end
+      private :rodeo_conformant_uri_exists?
 
       ##
       # @return [Array] the paths to each of the images split off from the PDF.
       def split_files
         DerivativeRodeo::Generators::PdfSplitGenerator.new(
-          input_uris: [@input_uri],
+          input_uris: [input_uri],
           output_location_template: output_location_template,
           preprocessed_location_template: preprocessed_location_template
         ).generated_files.map(&:file_path)
       rescue => e
+        # NOTE(review): this message calls #preprocessed_location_template again. When that call
+        # is what raised (e.g. MissingFileError, which is never memoized), it re-runs the lookups
+        # and raises again from inside this rescue -- confirm that is intended.
         message = "#{self.class}##{__method__} encountered `#{e.class}' “#{e}” for " \
-                  "input_uri: #{@input_uri.inspect}, " \
-                  "output_location_template: #{output_location_template.inspect}, and" \
+                  "input_uri: #{input_uri.inspect}, " \
+                  "output_location_template: #{output_location_template.inspect}, and " \
                   "preprocessed_location_template: #{preprocessed_location_template.inspect}."
         exception = RuntimeError.new(message)
diff --git a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb
index ac688901..5822bf5b 100644
--- a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb
+++ b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb
@@ -3,9 +3,14 @@
 require 'spec_helper'
 
 RSpec.describe IiifPrint::SplitPdfs::DerivativeRodeoSplitter do
-  let(:path) { __FILE__ }
+  let(:filename) { __FILE__ }
   let(:work) { double(MyWork, aark_id: '12345') }
   let(:file_set) { FileSet.new.tap { |fs| fs.save!(validate: false) } }
+  let(:location_stub) { double(DerivativeRodeo::StorageLocations::BaseLocation, exist?: true) }
+
+  before do
+    allow(DerivativeRodeo::StorageLocations::BaseLocation).to receive(:from_uri).and_return(location_stub)
+  end
 
   describe 'class' do
     subject { described_class }
@@ -13,22 +18,60 @@
     it { is_expected.to respond_to(:call) }
   end
 
-  describe "instance" do
-    subject { described_class.new(path, file_set: file_set) }
-    let(:generator) { double(DerivativeRodeo::Generators::PdfSplitGenerator, generated_files: []) }
+  subject(:instance) { described_class.new(filename, file_set: file_set) }
+  let(:generator) { double(DerivativeRodeo::Generators::PdfSplitGenerator, generated_files: []) }
+
+  before do
+    allow(file_set).to receive(:parent).and_return(work)
+    # TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but
+    # this part is only one piece of the overall integration.
+    allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__))
+  end
+
+  it { is_expected.to respond_to :split_files }
+
+  it 'uses the rodeo to split' do
+    expect(DerivativeRodeo::Generators::PdfSplitGenerator).to receive(:new).and_return(generator)
+    described_class.call(filename, file_set: file_set)
+  end
+
+  describe '#preprocessed_location_template' do
+    let(:derivative_rodeo_preprocessed_file) { IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename) }
+    let(:import_url) { "https://somewhere.com/that/exists.pdf" }
+    subject { instance.preprocessed_location_template }
+
+    context 'when the s3 file exists in the rodeo' do
+      it 'is that file' do
+        is_expected.to eq(derivative_rodeo_preprocessed_file)
+      end
+    end
+
+    context "when the s3 file does not exist in the rodeo and the file set's import_url exists" do
+      it 'is the import_url' do
+        file_set.import_url = import_url
+        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
+        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(true)
+        expect(subject).to eq(file_set.import_url)
+      end
+    end
+
+    context 'when the s3 file does not exist and the given import url does NOT exist' do
+      it 'will raise an IiifPrint::MissingFileError' do
+        file_set.import_url = import_url
+        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
+        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(false)
 
-    before do
-      allow(file_set).to receive(:parent).and_return(work)
-      # TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but
-      # this part is only one piece of the over all integration.
-      allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__))
+        expect { subject }.to raise_error(IiifPrint::MissingFileError)
+      end
     end
 
-    it { is_expected.to respond_to :split_files }
+    context "when the s3 file does not exist and we don't have an import_url" do
+      it 'is nil' do
+        file_set.import_url = nil
+        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
 
-    it 'uses the rodeo to split' do
-      expect(DerivativeRodeo::Generators::PdfSplitGenerator).to receive(:new).and_return(generator)
-      described_class.call(path, file_set: file_set)
+        expect(subject).to be_nil
+      end
     end
   end
 end