diff --git a/Gemfile b/Gemfile index 0a03b4512..c08e5b813 100644 --- a/Gemfile +++ b/Gemfile @@ -67,6 +67,7 @@ gem "mime-types" gem "mini_magick" gem "modernizr-rails" # Pin because capistrano raises an error at >= 7.2 +gem "net-sftp" gem "net-ssh", "~> 7.1.0" gem "normalize-rails" gem "oai" diff --git a/Gemfile.lock b/Gemfile.lock index 1b1208444..e632e23d6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1326,6 +1326,7 @@ DEPENDENCIES modernizr-rails net-imap net-pop + net-sftp net-smtp net-ssh (~> 7.1.0) normalize-rails diff --git a/app/controllers/ocr_requests_controller.rb b/app/controllers/ocr_requests_controller.rb index 5d03e5a15..0901656e0 100644 --- a/app/controllers/ocr_requests_controller.rb +++ b/app/controllers/ocr_requests_controller.rb @@ -21,7 +21,7 @@ def upload_file authorize! :update, OcrRequest.new @ocr_request = OcrRequest.new(ocr_request_params) if @ocr_request.save - PdfOcrJob.perform_later(resource: @ocr_request, out_path: ocr_out_file) + PdfOcrJob.perform_later(resource: @ocr_request) render status: :ok, json: { message: "uploaded" } else render status: :unprocessable_entity, json: @ocr_request.errors @@ -30,12 +30,6 @@ def upload_file private - def ocr_out_file - out_dir = Figgy.config["ocr_out_path"] - FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir) - File.join(out_dir, @ocr_request.filename) - end - def ocr_request_params { filename: params["file"].original_filename, diff --git a/app/jobs/create_ocr_request_job.rb b/app/jobs/create_ocr_request_job.rb index 653354eaf..0f3fe640d 100644 --- a/app/jobs/create_ocr_request_job.rb +++ b/app/jobs/create_ocr_request_job.rb @@ -10,15 +10,7 @@ def perform(file_path:) ocr_request = OcrRequest.new(filename: filename, state: "Enqueued") ocr_request.save! ocr_request.pdf.attach(io: File.open(file_path), filename: filename, content_type: "application/pdf") - out_path = File.join(ocr_out_dir, filename) - PdfOcrJob.perform_later(resource: ocr_request, out_path: out_path) + PdfOcrJob.perform_later(resource: ocr_request) File.delete(file_path) end - - def ocr_out_dir - out_dir = Figgy.config["ocr_out_path"] - FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir) - - out_dir - end end diff --git a/app/jobs/pdf_ocr_job.rb b/app/jobs/pdf_ocr_job.rb index db15fd6bb..5cc47a7a9 100644 --- a/app/jobs/pdf_ocr_job.rb +++ b/app/jobs/pdf_ocr_job.rb @@ -2,12 +2,11 @@ class PdfOcrJob < ApplicationJob queue_as :high - attr_reader :blob, :out_path, :resource + attr_reader :blob, :resource - def perform(resource:, out_path:) + def perform(resource:) logger.info("PDF OCR job initiated for: #{resource.filename}") @resource = resource - @out_path = out_path @blob = resource.pdf # Required for ActiveStorage blob to tempfile method. update_state(state: "Processing") return unless pdf_attached? @@ -24,15 +23,16 @@ def pdf_attached? def run_pdf_ocr blob.open do |file| - _stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, out_path.to_s) - return true if status.success? - update_state(state: "Error", message: "PDF OCR job failed: #{error_str}") - - # Copy orginal file to destination without OCR - FileUtils.cp file.path, out_path.to_s + _stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, temporary_file.path.to_s) + if status.success? + transfer_file(temporary_file.path.to_s) + true + else + update_state(state: "Error", message: "PDF OCR job failed: #{error_str}") + transfer_file(file.path) + false + end end - - false end def update_state(state:, message: nil) @@ -40,4 +40,20 @@ def update_state(state:, message: nil) resource.note = message if message resource.save end + + def temporary_file + @temporary_file ||= Tempfile.new + end + + def transfer_file(path) + host = Figgy.config["illiad_sftp_host"] + user = Figgy.config["illiad_sftp_user"] + pass = Figgy.config["illiad_sftp_pass"] + port = Figgy.config["illiad_sftp_port"] + out_path = File.join(Figgy.config["illiad_sftp_path"], "pdf", resource.filename) + + Net::SFTP.start(host, user, { password: pass, port: port }) do |sftp| + sftp.upload!(path, out_path) + end + end end diff --git a/config/config.yml b/config/config.yml index 7ebdda198..67c2841ab 100644 --- a/config/config.yml +++ b/config/config.yml @@ -7,8 +7,12 @@ defaults: &defaults archivespace_password: <%= ENV["ASPACE_PASSWORD"] %> plausible_api_key: <%= ENV["PLAUSIBLE_API_KEY"] %> cdl_in_path: <%= ENV["CDL_IN_PATH"] %> - ocr_in_path: <%= ENV["OCR_IN_PATH"] %> - ocr_out_path: <%= ENV["OCR_OUT_PATH"] %> + ocr_in_path: <%= ENV["OCR_ILLIAD_IN_PATH"] %> + illiad_sftp_host: <%= ENV["ILLIAD_SFTP_HOST"] || "sftp.example.com" %> + illiad_sftp_port: <%= ENV["ILLIAD_SFTP_PORT"] || "2222" %> + illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] || "user" %> + illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] || "password" %> + illiad_sftp_path: <%= ENV["ILLIAD_SFTP_BASE_PATH"] || "/illiad" %> pyramidals_bucket: "iiif-image-staging" pyramidals_region: "us-east-1" aws_access_key_id: <%= ENV["FIGGY_AWS_ACCESS_KEY_ID"] %> @@ -237,7 +241,6 @@ development: <<: *defaults cdl_in_path: <%= Rails.root.join("tmp", "cdl_in") %> ocr_in_path: <%= Rails.root.join("tmp", "ocr_in") %> - ocr_out_path: <%= Rails.root.join("tmp", "ocr_out") %> repository_path: <%= Rails.root.join("tmp", "more_files") %> pyramidals_bucket: <%= ENV["FIGGY_PYRAMIDALS_BUCKET"] %> cloud_geo_bucket: "figgy-geo-staging" @@ -254,7 +257,6 @@ test: plausible_api_key: "plausible_api_key" cdl_in_path: <%= Rails.root.join("tmp", "test_cdl_in") %> ocr_in_path: <%= Rails.root.join("tmp", "test_ocr_in#{ENV["TEST_ENV_NUMBER"]}") %> - ocr_out_path: <%= Rails.root.join("tmp", "test_ocr_out#{ENV["TEST_ENV_NUMBER"]}") %> repository_path: <%= Rails.root.join("tmp", "test_files#{ENV["TEST_ENV_NUMBER"]}") %> derivative_path: <%= Rails.root.join("tmp", "test_derivatives#{ENV["TEST_ENV_NUMBER"]}") %> pyramidal_derivative_path: <%= Rails.root.join("tmp", "test_pyramidal_derivatives#{ENV["TEST_ENV_NUMBER"]}") %> diff --git a/spec/jobs/pdf_ocr_job_spec.rb b/spec/jobs/pdf_ocr_job_spec.rb index 3c089b582..630d58fcb 100644 --- a/spec/jobs/pdf_ocr_job_spec.rb +++ b/spec/jobs/pdf_ocr_job_spec.rb @@ -3,27 +3,20 @@ RSpec.describe PdfOcrJob do describe "#perform" do - let(:out_dir) { Figgy.config["ocr_out_path"] } - let(:out_path) { File.join(out_dir, "ocr-sample.pdf") } + let(:sftp_session) { instance_double(Net::SFTP::Session) } let(:resource) { FactoryBot.create(:ocr_request, file: fixture_path) } before do - # Create tmp ocr out directory - FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir) - end - - after do - # Cleanup PDFs - File.delete(out_path) if File.exist?(out_path) + allow(Net::SFTP).to receive(:start).and_yield(sftp_session) + allow(sftp_session).to receive(:upload!) end context "with a valid PDF" do let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "sample.pdf") } - it "creates on OCRed PDF in an output directory and deletes the attached PDF" do - expect { described_class.perform_now(resource: resource, out_path: out_path) } - .to change { File.exist?(out_path) } - .from(false).to(true) + it "creates on OCRed PDF, uploads the file to the Illiad SFTP server, and deletes the attached PDF" do + described_class.perform_now(resource: resource) + expect(sftp_session).to have_received(:upload!).with(/.*/, /pdf\/sample\.pdf/) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Complete" expect(ocr_request.pdf.attached?).to be false @@ -33,12 +26,12 @@ context "with a PDF that can't be OCRed" do let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "bad.pdf") } - it "saves error on the ocr request resource and copies original file to out path" do - described_class.perform_now(resource: resource, out_path: out_path) + it "saves error on the ocr request resource and uploads the original file to the Illiad SFTP server" do + described_class.perform_now(resource: resource) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Error" expect(ocr_request.note).to include "PDF OCR job failed" - expect(File.exist?(out_path)).to be true + expect(sftp_session).to have_received(:upload!) expect(ocr_request.pdf.attached?).to be false end end @@ -47,7 +40,7 @@ let(:resource) { FactoryBot.create(:ocr_request) } it "adds an error message to the ocr request resource" do - described_class.perform_now(resource: resource, out_path: out_path) + described_class.perform_now(resource: resource) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Error" expect(ocr_request.note).to include "Resource has no attached PDF"