From c23cdaa04c17e1ee6d76920a3ff61bae5119e988 Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Mon, 16 Dec 2024 14:55:06 -0600 Subject: [PATCH 1/6] Add net-sftp gem --- Gemfile | 1 + Gemfile.lock | 1 + 2 files changed, 2 insertions(+) diff --git a/Gemfile b/Gemfile index 0a03b4512..924daee3f 100644 --- a/Gemfile +++ b/Gemfile @@ -68,6 +68,7 @@ gem "mini_magick" gem "modernizr-rails" # Pin because capistrano raises an error at >= 7.2 gem "net-ssh", "~> 7.1.0" +gem "net-sftp" gem "normalize-rails" gem "oai" gem "omniauth", "1.9.2" diff --git a/Gemfile.lock b/Gemfile.lock index 1b1208444..e632e23d6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1326,6 +1326,7 @@ DEPENDENCIES modernizr-rails net-imap net-pop + net-sftp net-smtp net-ssh (~> 7.1.0) normalize-rails From 9fa642712ef23c5b0c491cedfde51ce27c9a5248 Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Mon, 16 Dec 2024 14:55:21 -0600 Subject: [PATCH 2/6] Add configuration for sftp --- config/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/config.yml b/config/config.yml index 7ebdda198..8e4e7a249 100644 --- a/config/config.yml +++ b/config/config.yml @@ -9,6 +9,8 @@ defaults: &defaults cdl_in_path: <%= ENV["CDL_IN_PATH"] %> ocr_in_path: <%= ENV["OCR_IN_PATH"] %> ocr_out_path: <%= ENV["OCR_OUT_PATH"] %> + illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] %> + illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] %> pyramidals_bucket: "iiif-image-staging" pyramidals_region: "us-east-1" aws_access_key_id: <%= ENV["FIGGY_AWS_ACCESS_KEY_ID"] %> From 6b6133d293e0da4ac00a886ea401fe7a2469815f Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Wed, 18 Dec 2024 10:29:11 -0600 Subject: [PATCH 3/6] Refactor ocr jobs --- Gemfile | 2 +- app/jobs/create_ocr_request_job.rb | 10 +--------- app/jobs/pdf_ocr_job.rb | 17 ++++++++++++++--- spec/jobs/pdf_ocr_job_spec.rb | 8 ++++---- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/Gemfile b/Gemfile index 924daee3f..c08e5b813 100644 --- a/Gemfile +++ b/Gemfile @@ -67,8 +67,8 @@ gem "mime-types" gem "mini_magick" gem "modernizr-rails" # Pin because capistrano raises an error at >= 7.2 -gem "net-ssh", "~> 7.1.0" gem "net-sftp" +gem "net-ssh", "~> 7.1.0" gem "normalize-rails" gem "oai" gem "omniauth", "1.9.2" diff --git a/app/jobs/create_ocr_request_job.rb b/app/jobs/create_ocr_request_job.rb index 653354eaf..0f3fe640d 100644 --- a/app/jobs/create_ocr_request_job.rb +++ b/app/jobs/create_ocr_request_job.rb @@ -10,15 +10,7 @@ def perform(file_path:) ocr_request = OcrRequest.new(filename: filename, state: "Enqueued") ocr_request.save! ocr_request.pdf.attach(io: File.open(file_path), filename: filename, content_type: "application/pdf") - out_path = File.join(ocr_out_dir, filename) - PdfOcrJob.perform_later(resource: ocr_request, out_path: out_path) + PdfOcrJob.perform_later(resource: ocr_request) File.delete(file_path) end - - def ocr_out_dir - out_dir = Figgy.config["ocr_out_path"] - FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir) - - out_dir - end end diff --git a/app/jobs/pdf_ocr_job.rb b/app/jobs/pdf_ocr_job.rb index db15fd6bb..87e1b2808 100644 --- a/app/jobs/pdf_ocr_job.rb +++ b/app/jobs/pdf_ocr_job.rb @@ -2,12 +2,11 @@ class PdfOcrJob < ApplicationJob queue_as :high - attr_reader :blob, :out_path, :resource + attr_reader :blob, :resource - def perform(resource:, out_path:) + def perform(resource:) logger.info("PDF OCR job initiated for: #{resource.filename}") @resource = resource - @out_path = out_path @blob = resource.pdf # Required for ActiveStorage blob to tempfile method. update_state(state: "Processing") return unless pdf_attached? @@ -40,4 +39,16 @@ def update_state(state:, message: nil) resource.note = message if message resource.save end + + def out_path + File.join(ocr_out_dir, resource.filename) + end + + def ocr_out_dir + @ocr_out_dir ||= begin + path = Figgy.config["ocr_out_path"] + FileUtils.mkdir_p(path) unless File.directory?(path) + path + end + end end diff --git a/spec/jobs/pdf_ocr_job_spec.rb b/spec/jobs/pdf_ocr_job_spec.rb index 3c089b582..e816d291c 100644 --- a/spec/jobs/pdf_ocr_job_spec.rb +++ b/spec/jobs/pdf_ocr_job_spec.rb @@ -4,7 +4,7 @@ RSpec.describe PdfOcrJob do describe "#perform" do let(:out_dir) { Figgy.config["ocr_out_path"] } - let(:out_path) { File.join(out_dir, "ocr-sample.pdf") } + let(:out_path) { File.join(out_dir, "sample.pdf") } let(:resource) { FactoryBot.create(:ocr_request, file: fixture_path) } before do @@ -21,7 +21,7 @@ let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "sample.pdf") } it "creates on OCRed PDF in an output directory and deletes the attached PDF" do - expect { described_class.perform_now(resource: resource, out_path: out_path) } + expect { described_class.perform_now(resource: resource) } .to change { File.exist?(out_path) } .from(false).to(true) ocr_request = OcrRequest.all.first @@ -34,7 +34,7 @@ let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "bad.pdf") } it "saves error on the ocr request resource and copies original file to out path" do - described_class.perform_now(resource: resource, out_path: out_path) + described_class.perform_now(resource: resource) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Error" expect(ocr_request.note).to include "PDF OCR job failed" @@ -47,7 +47,7 @@ let(:resource) { FactoryBot.create(:ocr_request) } it "adds an error message to the ocr request resource" do - described_class.perform_now(resource: resource, out_path: out_path) + described_class.perform_now(resource: resource) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Error" expect(ocr_request.note).to include "Resource has no attached PDF" From cb6c0471d3f4e7e75a3f6ee9a7b50b17cb0c5e36 Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Wed, 18 Dec 2024 11:24:10 -0600 Subject: [PATCH 4/6] Upload PDFs to Illiad SFTP server --- app/controllers/ocr_requests_controller.rb | 8 +---- app/jobs/pdf_ocr_job.rb | 41 +++++++++++++--------- config/config.yml | 10 +++--- spec/jobs/pdf_ocr_job_spec.rb | 29 ++++++++------- 4 files changed, 45 insertions(+), 43 deletions(-) diff --git a/app/controllers/ocr_requests_controller.rb b/app/controllers/ocr_requests_controller.rb index 5d03e5a15..0901656e0 100644 --- a/app/controllers/ocr_requests_controller.rb +++ b/app/controllers/ocr_requests_controller.rb @@ -21,7 +21,7 @@ def upload_file authorize! :update, OcrRequest.new @ocr_request = OcrRequest.new(ocr_request_params) if @ocr_request.save - PdfOcrJob.perform_later(resource: @ocr_request, out_path: ocr_out_file) + PdfOcrJob.perform_later(resource: @ocr_request) render status: :ok, json: { message: "uploaded" } else render status: :unprocessable_entity, json: @ocr_request.errors @@ -30,12 +30,6 @@ def upload_file private - def ocr_out_file - out_dir = Figgy.config["ocr_out_path"] - FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir) - File.join(out_dir, @ocr_request.filename) - end - def ocr_request_params { filename: params["file"].original_filename, diff --git a/app/jobs/pdf_ocr_job.rb b/app/jobs/pdf_ocr_job.rb index 87e1b2808..a0cfe31d2 100644 --- a/app/jobs/pdf_ocr_job.rb +++ b/app/jobs/pdf_ocr_job.rb @@ -23,15 +23,16 @@ def pdf_attached? def run_pdf_ocr blob.open do |file| - _stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, out_path.to_s) - return true if status.success? - update_state(state: "Error", message: "PDF OCR job failed: #{error_str}") - - # Copy orginal file to destination without OCR - FileUtils.cp file.path, out_path.to_s + _stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, temporary_file.path.to_s) + if status.success? + transfer_file(temporary_file.path.to_s) + true + else + update_state(state: "Error", message: "PDF OCR job failed: #{error_str}") + transfer_file(file.path) + false + end end - - false end def update_state(state:, message: nil) @@ -40,15 +41,23 @@ def update_state(state:, message: nil) resource.save end - def out_path - File.join(ocr_out_dir, resource.filename) + def temporary_file + @temporary_file ||= Tempfile.new end - def ocr_out_dir - @ocr_out_dir ||= begin - path = Figgy.config["ocr_out_path"] - FileUtils.mkdir_p(path) unless File.directory?(path) - path - end + def transfer_file(path) + host = Figgy.config["illiad_sftp_host"] + user = Figgy.config["illiad_sftp_user"] + pass = Figgy.config["illiad_sftp_pass"] + port = Figgy.config["illiad_sftp_port"] + out_path = File.join(Figgy.config["illiad_sftp_path"], "pdf", resource.filename) + + begin + sftp = Net::SFTP.start(host, user, { password: pass, port: port }) + sftp.upload!(path, out_path) + ensure + sftp.close_channel + sftp.session.close + end end end diff --git a/config/config.yml b/config/config.yml index 8e4e7a249..362de4ed6 100644 --- a/config/config.yml +++ b/config/config.yml @@ -8,9 +8,11 @@ defaults: &defaults plausible_api_key: <%= ENV["PLAUSIBLE_API_KEY"] %> cdl_in_path: <%= ENV["CDL_IN_PATH"] %> ocr_in_path: <%= ENV["OCR_IN_PATH"] %> - ocr_out_path: <%= ENV["OCR_OUT_PATH"] %> - illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] %> - illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] %> + illiad_sftp_host: <%= ENV["ILLIAD_SFTP_HOST"] || "sftp.example.com" %> + illiad_sftp_port: <%= ENV["ILLIAD_SFTP_PORT"] || "2222" %> + illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] || "user" %> + illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] || "password" %> + illiad_sftp_path: <%= ENV["ILLIAD_SFTP_BASE_PATH"] || "/illiad" %> pyramidals_bucket: "iiif-image-staging" pyramidals_region: "us-east-1" aws_access_key_id: <%= ENV["FIGGY_AWS_ACCESS_KEY_ID"] %> @@ -239,7 +241,6 @@ development: <<: *defaults cdl_in_path: <%= Rails.root.join("tmp", "cdl_in") %> ocr_in_path: <%= Rails.root.join("tmp", "ocr_in") %> - ocr_out_path: <%= Rails.root.join("tmp", "ocr_out") %> repository_path: <%= Rails.root.join("tmp", "more_files") %> pyramidals_bucket: <%= ENV["FIGGY_PYRAMIDALS_BUCKET"] %> cloud_geo_bucket: "figgy-geo-staging" @@ -256,7 +257,6 @@ test: plausible_api_key: "plausible_api_key" cdl_in_path: <%= Rails.root.join("tmp", "test_cdl_in") %> ocr_in_path: <%= Rails.root.join("tmp", "test_ocr_in#{ENV["TEST_ENV_NUMBER"]}") %> - ocr_out_path: <%= Rails.root.join("tmp", "test_ocr_out#{ENV["TEST_ENV_NUMBER"]}") %> repository_path: <%= Rails.root.join("tmp", "test_files#{ENV["TEST_ENV_NUMBER"]}") %> derivative_path: <%= Rails.root.join("tmp", "test_derivatives#{ENV["TEST_ENV_NUMBER"]}") %> pyramidal_derivative_path: <%= Rails.root.join("tmp", "test_pyramidal_derivatives#{ENV["TEST_ENV_NUMBER"]}") %> diff --git a/spec/jobs/pdf_ocr_job_spec.rb b/spec/jobs/pdf_ocr_job_spec.rb index e816d291c..e95cf2757 100644 --- a/spec/jobs/pdf_ocr_job_spec.rb +++ b/spec/jobs/pdf_ocr_job_spec.rb @@ -3,27 +3,26 @@ RSpec.describe PdfOcrJob do describe "#perform" do - let(:out_dir) { Figgy.config["ocr_out_path"] } - let(:out_path) { File.join(out_dir, "sample.pdf") } + let(:ssh_session) { instance_double(Net::SSH::Connection::Session) } + let(:sftp_session) { instance_double(Net::SFTP::Session) } let(:resource) { FactoryBot.create(:ocr_request, file: fixture_path) } before do - # Create tmp ocr out directory - FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir) - end - - after do - # Cleanup PDFs - File.delete(out_path) if File.exist?(out_path) + allow(Net::SFTP).to receive(:start).and_return(sftp_session) + allow(sftp_session).to receive(:upload!) + allow(sftp_session).to receive(:close_channel) + allow(sftp_session).to receive(:session).and_return(ssh_session) + allow(ssh_session).to receive(:close) end context "with a valid PDF" do let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "sample.pdf") } - it "creates on OCRed PDF in an output directory and deletes the attached PDF" do - expect { described_class.perform_now(resource: resource) } - .to change { File.exist?(out_path) } - .from(false).to(true) + it "creates on OCRed PDF, uploads the file to the Illiad SFTP server, and deletes the attached PDF" do + described_class.perform_now(resource: resource) + expect(sftp_session).to have_received(:upload!) + expect(sftp_session).to have_received(:close_channel) + expect(ssh_session).to have_received(:close) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Complete" expect(ocr_request.pdf.attached?).to be false @@ -33,12 +32,12 @@ context "with a PDF that can't be OCRed" do let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "bad.pdf") } - it "saves error on the ocr request resource and copies original file to out path" do + it "saves error on the ocr request resource and uploads the original file to the Illiad SFTP server" do described_class.perform_now(resource: resource) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Error" expect(ocr_request.note).to include "PDF OCR job failed" - expect(File.exist?(out_path)).to be true + expect(sftp_session).to have_received(:upload!) expect(ocr_request.pdf.attached?).to be false end end From 21a0fc0329146874f155a80d8133bd776b49836e Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Thu, 19 Dec 2024 11:22:51 -0600 Subject: [PATCH 5/6] Pass block to SFTP.start method --- app/jobs/pdf_ocr_job.rb | 6 +----- spec/jobs/pdf_ocr_job_spec.rb | 10 ++-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/app/jobs/pdf_ocr_job.rb b/app/jobs/pdf_ocr_job.rb index a0cfe31d2..5cc47a7a9 100644 --- a/app/jobs/pdf_ocr_job.rb +++ b/app/jobs/pdf_ocr_job.rb @@ -52,12 +52,8 @@ def transfer_file(path) port = Figgy.config["illiad_sftp_port"] out_path = File.join(Figgy.config["illiad_sftp_path"], "pdf", resource.filename) - begin - sftp = Net::SFTP.start(host, user, { password: pass, port: port }) + Net::SFTP.start(host, user, { password: pass, port: port }) do |sftp| sftp.upload!(path, out_path) - ensure - sftp.close_channel - sftp.session.close end end end diff --git a/spec/jobs/pdf_ocr_job_spec.rb b/spec/jobs/pdf_ocr_job_spec.rb index e95cf2757..630d58fcb 100644 --- a/spec/jobs/pdf_ocr_job_spec.rb +++ b/spec/jobs/pdf_ocr_job_spec.rb @@ -3,16 +3,12 @@ RSpec.describe PdfOcrJob do describe "#perform" do - let(:ssh_session) { instance_double(Net::SSH::Connection::Session) } let(:sftp_session) { instance_double(Net::SFTP::Session) } let(:resource) { FactoryBot.create(:ocr_request, file: fixture_path) } before do - allow(Net::SFTP).to receive(:start).and_return(sftp_session) + allow(Net::SFTP).to receive(:start).and_yield(sftp_session) allow(sftp_session).to receive(:upload!) - allow(sftp_session).to receive(:close_channel) - allow(sftp_session).to receive(:session).and_return(ssh_session) - allow(ssh_session).to receive(:close) end context "with a valid PDF" do @@ -20,9 +16,7 @@ it "creates on OCRed PDF, uploads the file to the Illiad SFTP server, and deletes the attached PDF" do described_class.perform_now(resource: resource) - expect(sftp_session).to have_received(:upload!) - expect(sftp_session).to have_received(:close_channel) - expect(ssh_session).to have_received(:close) + expect(sftp_session).to have_received(:upload!).with(/.*/, /pdf\/sample\.pdf/) ocr_request = OcrRequest.all.first expect(ocr_request.state).to eq "Complete" expect(ocr_request.pdf.attached?).to be false From b3040409c5581ed35829ce69d3ee5b784bbb79f4 Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Thu, 19 Dec 2024 15:37:55 -0600 Subject: [PATCH 6/6] Update ocr in path value --- config/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.yml b/config/config.yml index 362de4ed6..67c2841ab 100644 --- a/config/config.yml +++ b/config/config.yml @@ -7,7 +7,7 @@ defaults: &defaults archivespace_password: <%= ENV["ASPACE_PASSWORD"] %> plausible_api_key: <%= ENV["PLAUSIBLE_API_KEY"] %> cdl_in_path: <%= ENV["CDL_IN_PATH"] %> - ocr_in_path: <%= ENV["OCR_IN_PATH"] %> + ocr_in_path: <%= ENV["OCR_ILLIAD_IN_PATH"] %> illiad_sftp_host: <%= ENV["ILLIAD_SFTP_HOST"] || "sftp.example.com" %> illiad_sftp_port: <%= ENV["ILLIAD_SFTP_PORT"] || "2222" %> illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] || "user" %>