From f7fcea89ff1a8f4c24a223b394e1f40e385af2e5 Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Wed, 18 Dec 2024 11:24:10 -0600 Subject: [PATCH] Upload PDFs to Illiad SFTP server --- app/controllers/ocr_requests_controller.rb | 8 +---- app/jobs/pdf_ocr_job.rb | 41 +++++++++++++--------- config/config.yml | 10 +++--- spec/jobs/pdf_ocr_job_spec.rb | 29 ++++++++------- 4 files changed, 45 insertions(+), 43 deletions(-) diff --git a/app/controllers/ocr_requests_controller.rb b/app/controllers/ocr_requests_controller.rb index 5d03e5a15d..0901656e04 100644 --- a/app/controllers/ocr_requests_controller.rb +++ b/app/controllers/ocr_requests_controller.rb @@ -21,7 +21,7 @@ def upload_file authorize! :update, OcrRequest.new @ocr_request = OcrRequest.new(ocr_request_params) if @ocr_request.save - PdfOcrJob.perform_later(resource: @ocr_request, out_path: ocr_out_file) + PdfOcrJob.perform_later(resource: @ocr_request) render status: :ok, json: { message: "uploaded" } else render status: :unprocessable_entity, json: @ocr_request.errors @@ -30,12 +30,6 @@ def upload_file private - def ocr_out_file - out_dir = Figgy.config["ocr_out_path"] - FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir) - File.join(out_dir, @ocr_request.filename) - end - def ocr_request_params { filename: params["file"].original_filename, diff --git a/app/jobs/pdf_ocr_job.rb b/app/jobs/pdf_ocr_job.rb index 87e1b28086..a0cfe31d21 100644 --- a/app/jobs/pdf_ocr_job.rb +++ b/app/jobs/pdf_ocr_job.rb @@ -23,15 +23,16 @@ def pdf_attached? def run_pdf_ocr blob.open do |file| - _stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, out_path.to_s) - return true if status.success? - update_state(state: "Error", message: "PDF OCR job failed: #{error_str}") - - # Copy orginal file to destination without OCR - FileUtils.cp file.path, out_path.to_s + _stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, temporary_file.path.to_s) + if status.success? + transfer_file(temporary_file.path.to_s) + true + else + update_state(state: "Error", message: "PDF OCR job failed: #{error_str}") + transfer_file(file.path) + false + end end - - false end def update_state(state:, message: nil) @@ -40,15 +41,23 @@ def update_state(state:, message: nil) resource.save end - def out_path - File.join(ocr_out_dir, resource.filename) + def temporary_file + @temporary_file ||= Tempfile.new end - def ocr_out_dir - @ocr_out_dir ||= begin - path = Figgy.config["ocr_out_path"] - FileUtils.mkdir_p(path) unless File.directory?(path) - path - end + def transfer_file(path) + host = Figgy.config["illiad_sftp_host"] + user = Figgy.config["illiad_sftp_user"] + pass = Figgy.config["illiad_sftp_pass"] + port = Figgy.config["illiad_sftp_port"] + out_path = File.join(Figgy.config["illiad_sftp_path"], "pdf", resource.filename) + + begin + sftp = Net::SFTP.start(host, user, { password: pass, port: port }) + sftp.upload!(path, out_path) + ensure + sftp.close_channel + sftp.session.close + end end end diff --git a/config/config.yml b/config/config.yml index 8e4e7a249d..362de4ed6e 100644 --- a/config/config.yml +++ b/config/config.yml @@ -8,9 +8,11 @@ defaults: &defaults plausible_api_key: <%= ENV["PLAUSIBLE_API_KEY"] %> cdl_in_path: <%= ENV["CDL_IN_PATH"] %> ocr_in_path: <%= ENV["OCR_IN_PATH"] %> - ocr_out_path: <%= ENV["OCR_OUT_PATH"] %> - illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] %> - illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] %> + illiad_sftp_host: <%= ENV["ILLIAD_SFTP_HOST"] || "sftp.example.com" %> + illiad_sftp_port: <%= ENV["ILLIAD_SFTP_PORT"] || "2222" %> + illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] || "user" %> + illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] || "password" %> + illiad_sftp_path: <%= ENV["ILLIAD_SFTP_BASE_PATH"] || "/illiad" %> pyramidals_bucket: "iiif-image-staging" pyramidals_region: "us-east-1" aws_access_key_id: <%= ENV["FIGGY_AWS_ACCESS_KEY_ID"] %> @@ -239,7 +241,6 @@ development: <<: *defaults cdl_in_path: <%= Rails.root.join("tmp", "cdl_in") %> ocr_in_path: <%= Rails.root.join("tmp", "ocr_in") %> - ocr_out_path: <%= Rails.root.join("tmp", "ocr_out") %> repository_path: <%= Rails.root.join("tmp", "more_files") %> pyramidals_bucket: <%= ENV["FIGGY_PYRAMIDALS_BUCKET"] %> cloud_geo_bucket: "figgy-geo-staging" @@ -256,7 +257,6 @@ test: plausible_api_key: "plausible_api_key" cdl_in_path: <%= Rails.root.join("tmp", "test_cdl_in") %> ocr_in_path: <%= Rails.root.join("tmp", "test_ocr_in#{ENV["TEST_ENV_NUMBER"]}") %> - ocr_out_path: <%= Rails.root.join("tmp", "test_ocr_out#{ENV["TEST_ENV_NUMBER"]}") %> repository_path: <%= Rails.root.join("tmp", "test_files#{ENV["TEST_ENV_NUMBER"]}") %> derivative_path: <%= Rails.root.join("tmp", "test_derivatives#{ENV["TEST_ENV_NUMBER"]}") %> pyramidal_derivative_path: <%= Rails.root.join("tmp", "test_pyramidal_derivatives#{ENV["TEST_ENV_NUMBER"]}") %> diff --git a/spec/jobs/pdf_ocr_job_spec.rb b/spec/jobs/pdf_ocr_job_spec.rb index e816d291c2..e95cf2757a 100644 --- a/spec/jobs/pdf_ocr_job_spec.rb +++ b/spec/jobs/pdf_ocr_job_spec.rb @@ -3,27 +3,26 @@ RSpec.describe PdfOcrJob do describe "#perform" do - let(:out_dir) { Figgy.config["ocr_out_path"] } - let(:out_path) { File.join(out_dir, "sample.pdf") } + let(:ssh_session) { instance_double(Net::SSH::Connection::Session) } + let(:sftp_session) { instance_double(Net::SFTP::Session) } let(:resource) { FactoryBot.create(:ocr_request, file: fixture_path) } before do - # Create tmp ocr out directory - FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir) - end - - after do - # Cleanup PDFs - File.delete(out_path) if File.exist?(out_path) + allow(Net::SFTP).to receive(:start).and_return(sftp_session) + allow(sftp_session).to receive(:upload!) + allow(sftp_session).to receive(:close_channel) + allow(sftp_session).to receive(:session).and_return(ssh_session) + allow(ssh_session).to receive(:close) end context "with a valid PDF" do let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "sample.pdf") } - it "creates on OCRed PDF in an output directory and deletes the attached PDF" do - expect { described_class.perform_now(resource: resource) } - .to change { File.exist?(out_path) } - .from(false).to(true) + it "creates on OCRed PDF, uploads the file to the Illiad SFTP server, and deletes the attached PDF" do + described_class.perform_now(resource: resource) + expect(sftp_session).to have_received(:upload!) + expect(sftp_session).to have_received(:close_channel) + expect(ssh_session).to have_received(:close) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Complete" expect(ocr_request.pdf.attached?).to be false @@ -33,12 +32,12 @@ context "with a PDF that can't be OCRed" do let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "bad.pdf") } - it "saves error on the ocr request resource and copies original file to out path" do + it "saves error on the ocr request resource and uploads the original file to the Illiad SFTP server" do described_class.perform_now(resource: resource) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Error" expect(ocr_request.note).to include "PDF OCR job failed" - expect(File.exist?(out_path)).to be true + expect(sftp_session).to have_received(:upload!) expect(ocr_request.pdf.attached?).to be false end end