Skip to content

Commit

Permalink
Merge pull request #283 from scientist-softserv/i282-gracefully-handl…
Browse files Browse the repository at this point in the history
…ing-of-copy-failure

🎁 Add graceful fallback of preprocessing
  • Loading branch information
jeremyf authored Nov 6, 2023
2 parents 652cc43 + 4b4165f commit 5fb1531
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 19 deletions.
3 changes: 3 additions & 0 deletions lib/iiif_print/errors.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@ class IiifPrintError < StandardError
# Raised for a data transformation or read error.
class DataError < IiifPrintError
end

# Raised when a file that was expected to exist cannot be found (for example, when a file
# set's import_url does not exist at its remote location).
class MissingFileError < IiifPrintError
end
end
63 changes: 57 additions & 6 deletions lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def self.call(filename, file_set:)
end

def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
@filename = filename
@file_set = file_set

@input_uri = "file://#{filename}"

# We are writing the images to a local location that CarrierWave can upload. This is a
Expand All @@ -28,9 +31,10 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}')

@output_location_template = "file://#{output_template_path}"
@preprocessed_location_template = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)
end

attr_reader :filename, :file_set

##
# This is where, in "Fedora" we have the original file. This is not the original file in the
# pre-processing location but instead the long-term location of the file in the application
Expand All @@ -48,8 +52,11 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
attr_reader :output_location_template

##
# Where can we find, in the DerivativeRodeo's storage, what has already been done regarding
# derivative generation.
# Where can we find the file that represents the pre-processing template? In this case, the
# original PDF file.
#
# The logic handles the case where SpaceStone successfully fetched the file in order to then
# perform processing.
#
# For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3
# bucket that we then use for IIIF Print.
Expand All @@ -61,19 +68,63 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
# @return [String]
#
# @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
attr_reader :preprocessed_location_template
##
# Resolves, and memoizes, the location to use as the pre-processed template.
#
# The candidates are checked in order:
#
# 1. The URI that IiifPrint::DerivativeRodeoService derives for the file set and filename.
# 2. The file set's import_url, when one is present.
#
# The result is memoized with `defined?` so that a `nil` answer is remembered as well. When
# this method raises, nothing is memoized and the next call checks again.
#
# @return [String, nil] the first candidate found to exist; `nil` when the derived URI does not
#   exist and the file set has no (or only a blank) import_url.
# @raise [MissingFileError] when the file set has an import_url that does not exist.
def preprocessed_location_template
  return @preprocessed_location_template if defined?(@preprocessed_location_template)

  # The log prefix is built here so that every message names this public method, not a helper.
  @preprocessed_location_template = locate_preprocessed_file("#{self.class}##{__method__}")
end

# Walks the candidate locations in priority order and returns the first that exists.
def locate_preprocessed_file(context)
  candidate = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)

  if rodeo_conformant_uri_exists?(candidate)
    Rails.logger.debug("#{context} found existing file at location #{candidate}. High five partner!")
    candidate
  elsif file_set.import_url.to_s.strip.empty?
    # A blank import_url cannot name a location, so treat it the same as a missing one.
    log_missing_preprocess(context, candidate)
  else
    verified_import_url(context, candidate)
  end
end
private :locate_preprocessed_file

# Returns the file set's import_url once it is confirmed to exist; raises MissingFileError otherwise.
def verified_import_url(context, candidate)
  import_url = file_set.import_url
  message = "#{context} did not find #{candidate.inspect} to exist. " \
            "Moving on to check the #{file_set.class}#import_url of #{import_url.inspect}"
  Rails.logger.warn(message)

  # If the DerivativeRodeo doesn't know about the adapter for the import, this will raise
  # an error.
  #
  # Since the file was not pre-processed, we're likely now going to be downloading that
  # file and running all of the derivatives locally.
  unless rodeo_conformant_uri_exists?(import_url)
    message = "#{context} expected #{import_url.inspect} as specified " \
              "by #{file_set.class}#import_url to exist at remote location, but it did not."
    raise MissingFileError, message
  end

  message = "#{context} found #{file_set.class}#import_url of #{import_url.inspect} to exist. " \
            "Perhaps there was a problem in SpaceStone downloading the file? Things should be okay."
  Rails.logger.info(message)
  import_url
end
private :verified_import_url

# Warns that no pre-processed location could be found and returns `nil`.
def log_missing_preprocess(context, candidate)
  message = "#{context} could not find an existing file at #{candidate} " \
            "nor an import_url for #{file_set.class} ID=#{file_set.id}. Returning `nil' as we have no possible preprocess. " \
            "Maybe the input_uri #{input_uri.inspect} will be adequate."
  Rails.logger.warn(message)
  nil
end
private :log_missing_preprocess

##
# Asks the DerivativeRodeo storage location built from the given URI whether it exists.
#
# @param uri [String] a URI that DerivativeRodeo can turn into a storage location; an error is
#   expected when DerivativeRodeo has no adapter for it.
# @return [Boolean] whatever the location's `exist?` reports.
def rodeo_conformant_uri_exists?(uri)
  location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri)
  location.exist?
end
private :rodeo_conformant_uri_exists?

##
# @return [Array<Strings>] the paths to each of the images split off from the PDF.
def split_files
DerivativeRodeo::Generators::PdfSplitGenerator.new(
input_uris: [@input_uri],
input_uris: [input_uri],
output_location_template: output_location_template,
preprocessed_location_template: preprocessed_location_template
).generated_files.map(&:file_path)
rescue => e
message = "#{self.class}##{__method__} encountered `#{e.class}' “#{e}” for " \
"input_uri: #{@input_uri.inspect}, " \
"input_uri: #{input_uri.inspect}, " \
"output_location_template: #{output_location_template.inspect}, and" \
"preprocessed_location_template: #{preprocessed_location_template.inspect}."
exception = RuntimeError.new(message)
Expand Down
69 changes: 56 additions & 13 deletions spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,75 @@
require 'spec_helper'

RSpec.describe IiifPrint::SplitPdfs::DerivativeRodeoSplitter do
let(:path) { __FILE__ }
let(:filename) { __FILE__ }
let(:work) { double(MyWork, aark_id: '12345') }
let(:file_set) { FileSet.new.tap { |fs| fs.save!(validate: false) } }
let(:location_stub) { double(DerivativeRodeo::StorageLocations::BaseLocation, exist?: true) }

before do
allow(DerivativeRodeo::StorageLocations::BaseLocation).to receive(:from_uri).and_return(location_stub)
end

describe 'class' do
subject { described_class }

it { is_expected.to respond_to(:call) }
end

describe "instance" do
subject { described_class.new(path, file_set: file_set) }
let(:generator) { double(DerivativeRodeo::Generators::PdfSplitGenerator, generated_files: []) }
subject(:instance) { described_class.new(filename, file_set: file_set) }
let(:generator) { double(DerivativeRodeo::Generators::PdfSplitGenerator, generated_files: []) }

before do
allow(file_set).to receive(:parent).and_return(work)
# TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but
# this part is only one piece of the over all integration.
allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__))
end

it { is_expected.to respond_to :split_files }

it 'uses the rodeo to split' do
expect(DerivativeRodeo::Generators::PdfSplitGenerator).to receive(:new).and_return(generator)
described_class.call(filename, file_set: file_set)
end

describe '#preprocessed_location_template' do
let(:derivative_rodeo_preprocessed_file) { IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename) }
let(:import_url) { "https://somewhere.com/that/exists.pdf" }
subject { instance.preprocessed_location_template }

context 'when the s3 file exists in the rodeo' do
it 'is that file' do
is_expected.to eq(derivative_rodeo_preprocessed_file)
end
end

context 'when the s3 file does not exist in the rodeo and the file sets import url exists' do
it 'is the import_url' do
file_set.import_url = import_url
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(true)
expect(subject).to eq(file_set.import_url)
end
end

context 'when the s3 file does not exist and the given import url does NOT exist' do
it 'will raise a IiifPrint::MissingFileError' do
file_set.import_url = import_url
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(false)

before do
allow(file_set).to receive(:parent).and_return(work)
# TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but
# this part is only one piece of the over all integration.
allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__))
expect { subject }.to raise_error(IiifPrint::MissingFileError)
end
end

it { is_expected.to respond_to :split_files }
context "when the s3 file does not exist and we don't have a remote_url" do
it 'will use the given filename' do
file_set.import_url = nil
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)

it 'uses the rodeo to split' do
expect(DerivativeRodeo::Generators::PdfSplitGenerator).to receive(:new).and_return(generator)
described_class.call(path, file_set: file_set)
expect(subject).to eq(nil)
end
end
end
end

0 comments on commit 5fb1531

Please sign in to comment.