Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates for new Illiad mount and SFTP server #6576

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ gem "mime-types"
gem "mini_magick"
gem "modernizr-rails"
# Pin net-ssh because capistrano raises an error at >= 7.2
gem "net-sftp"
gem "net-ssh", "~> 7.1.0"
gem "normalize-rails"
gem "oai"
Expand Down
1 change: 1 addition & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1326,6 +1326,7 @@ DEPENDENCIES
modernizr-rails
net-imap
net-pop
net-sftp
net-smtp
net-ssh (~> 7.1.0)
normalize-rails
Expand Down
8 changes: 1 addition & 7 deletions app/controllers/ocr_requests_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def upload_file
authorize! :update, OcrRequest.new
@ocr_request = OcrRequest.new(ocr_request_params)
if @ocr_request.save
PdfOcrJob.perform_later(resource: @ocr_request, out_path: ocr_out_file)
PdfOcrJob.perform_later(resource: @ocr_request)
render status: :ok, json: { message: "uploaded" }
else
render status: :unprocessable_entity, json: @ocr_request.errors
Expand All @@ -30,12 +30,6 @@ def upload_file

private

# Returns the path inside the configured "ocr_out_path" directory that is
# named after the current OCR request's filename. The directory is created
# first when it does not exist yet.
def ocr_out_file
  directory = Figgy.config["ocr_out_path"].tap do |dir|
    FileUtils.mkdir_p(dir) unless Dir.exist?(dir)
  end
  File.join(directory, @ocr_request.filename)
end

def ocr_request_params
{
filename: params["file"].original_filename,
Expand Down
10 changes: 1 addition & 9 deletions app/jobs/create_ocr_request_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,7 @@ def perform(file_path:)
ocr_request = OcrRequest.new(filename: filename, state: "Enqueued")
ocr_request.save!
ocr_request.pdf.attach(io: File.open(file_path), filename: filename, content_type: "application/pdf")
out_path = File.join(ocr_out_dir, filename)
PdfOcrJob.perform_later(resource: ocr_request, out_path: out_path)
PdfOcrJob.perform_later(resource: ocr_request)
File.delete(file_path)
end

# Returns the configured "ocr_out_path" directory, creating it on disk
# first when it is not already present.
def ocr_out_dir
  Figgy.config["ocr_out_path"].tap do |directory|
    FileUtils.mkdir_p(directory) unless Dir.exist?(directory)
  end
end
end
38 changes: 27 additions & 11 deletions app/jobs/pdf_ocr_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@

class PdfOcrJob < ApplicationJob
queue_as :high
attr_reader :blob, :out_path, :resource
attr_reader :blob, :resource

def perform(resource:, out_path:)
def perform(resource:)
logger.info("PDF OCR job initiated for: #{resource.filename}")
@resource = resource
@out_path = out_path
@blob = resource.pdf # Required for ActiveStorage blob to tempfile method.
update_state(state: "Processing")
return unless pdf_attached?
Expand All @@ -24,20 +23,37 @@ def pdf_attached?

def run_pdf_ocr
blob.open do |file|
_stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, out_path.to_s)
return true if status.success?
update_state(state: "Error", message: "PDF OCR job failed: #{error_str}")

# Copy original file to destination without OCR
FileUtils.cp file.path, out_path.to_s
_stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, temporary_file.path.to_s)
if status.success?
transfer_file(temporary_file.path.to_s)
true
else
update_state(state: "Error", message: "PDF OCR job failed: #{error_str}")
transfer_file(file.path)
false
end
end

false
end

# Records a new processing state on the job's resource and saves it.
#
# @param state [String] value assigned to the resource's state
# @param message [String, nil] optional note; the existing note is left
#   untouched when no message is given
# @return whatever the resource's #save returns
def update_state(state:, message: nil)
  record = resource
  record.state = state
  record.note = message if message
  record.save
end

# Lazily creates a Tempfile and hands back the same instance on every
# later call, so callers share a single temporary file.
def temporary_file
  return @temporary_file if @temporary_file

  @temporary_file = Tempfile.new
end

# Uploads a local file into the "pdf" directory beneath the configured
# "illiad_sftp_path" on the SFTP server described by Figgy.config
# (illiad_sftp_host / _user / _pass / _port).
#
# The remote file is named after resource.filename. Only the basename is
# used, so directory components in the stored filename (e.g. "../") cannot
# move the upload outside of the "pdf" directory.
#
# @param path [String] local path of the file to upload
def transfer_file(path)
  settings = Figgy.config
  # The filename can originate from a user upload; strip any directory parts.
  remote_name = File.basename(resource.filename)
  out_path = File.join(settings["illiad_sftp_path"], "pdf", remote_name)
  ssh_options = { password: settings["illiad_sftp_pass"], port: settings["illiad_sftp_port"] }

  Net::SFTP.start(settings["illiad_sftp_host"], settings["illiad_sftp_user"], ssh_options) do |sftp|
    sftp.upload!(path, out_path)
  end
end
end
10 changes: 6 additions & 4 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@ defaults: &defaults
archivespace_password: <%= ENV["ASPACE_PASSWORD"] %>
plausible_api_key: <%= ENV["PLAUSIBLE_API_KEY"] %>
cdl_in_path: <%= ENV["CDL_IN_PATH"] %>
ocr_in_path: <%= ENV["OCR_IN_PATH"] %>
ocr_out_path: <%= ENV["OCR_OUT_PATH"] %>
ocr_in_path: <%= ENV["OCR_ILLIAD_IN_PATH"] %>
illiad_sftp_host: <%= ENV["ILLIAD_SFTP_HOST"] || "sftp.example.com" %>
illiad_sftp_port: <%= ENV["ILLIAD_SFTP_PORT"] || "2222" %>
illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] || "user" %>
illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] || "password" %>
illiad_sftp_path: <%= ENV["ILLIAD_SFTP_BASE_PATH"] || "/illiad" %>
pyramidals_bucket: "iiif-image-staging"
pyramidals_region: "us-east-1"
aws_access_key_id: <%= ENV["FIGGY_AWS_ACCESS_KEY_ID"] %>
Expand Down Expand Up @@ -237,7 +241,6 @@ development:
<<: *defaults
cdl_in_path: <%= Rails.root.join("tmp", "cdl_in") %>
ocr_in_path: <%= Rails.root.join("tmp", "ocr_in") %>
ocr_out_path: <%= Rails.root.join("tmp", "ocr_out") %>
repository_path: <%= Rails.root.join("tmp", "more_files") %>
pyramidals_bucket: <%= ENV["FIGGY_PYRAMIDALS_BUCKET"] %>
cloud_geo_bucket: "figgy-geo-staging"
Expand All @@ -254,7 +257,6 @@ test:
plausible_api_key: "plausible_api_key"
cdl_in_path: <%= Rails.root.join("tmp", "test_cdl_in") %>
ocr_in_path: <%= Rails.root.join("tmp", "test_ocr_in#{ENV["TEST_ENV_NUMBER"]}") %>
ocr_out_path: <%= Rails.root.join("tmp", "test_ocr_out#{ENV["TEST_ENV_NUMBER"]}") %>
repository_path: <%= Rails.root.join("tmp", "test_files#{ENV["TEST_ENV_NUMBER"]}") %>
derivative_path: <%= Rails.root.join("tmp", "test_derivatives#{ENV["TEST_ENV_NUMBER"]}") %>
pyramidal_derivative_path: <%= Rails.root.join("tmp", "test_pyramidal_derivatives#{ENV["TEST_ENV_NUMBER"]}") %>
Expand Down
27 changes: 10 additions & 17 deletions spec/jobs/pdf_ocr_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,20 @@

RSpec.describe PdfOcrJob do
describe "#perform" do
let(:out_dir) { Figgy.config["ocr_out_path"] }
let(:out_path) { File.join(out_dir, "ocr-sample.pdf") }
let(:sftp_session) { instance_double(Net::SFTP::Session) }
let(:resource) { FactoryBot.create(:ocr_request, file: fixture_path) }

before do
# Create tmp ocr out directory
FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir)
end

after do
# Cleanup PDFs
File.delete(out_path) if File.exist?(out_path)
allow(Net::SFTP).to receive(:start).and_yield(sftp_session)
allow(sftp_session).to receive(:upload!)
end

context "with a valid PDF" do
let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "sample.pdf") }

it "creates on OCRed PDF in an output directory and deletes the attached PDF" do
expect { described_class.perform_now(resource: resource, out_path: out_path) }
.to change { File.exist?(out_path) }
.from(false).to(true)
it "creates on OCRed PDF, uploads the file to the Illiad SFTP server, and deletes the attached PDF" do
described_class.perform_now(resource: resource)
expect(sftp_session).to have_received(:upload!).with(/.*/, /pdf\/sample\.pdf/)
ocr_request = OcrRequest.all.first
expect(ocr_request.state).to eq "Complete"
expect(ocr_request.pdf.attached?).to be false
Expand All @@ -33,12 +26,12 @@
context "with a PDF that can't be OCRed" do
let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "bad.pdf") }

it "saves error on the ocr request resource and copies original file to out path" do
described_class.perform_now(resource: resource, out_path: out_path)
it "saves error on the ocr request resource and uploads the original file to the Illiad SFTP server" do
described_class.perform_now(resource: resource)
ocr_request = OcrRequest.all.first
expect(ocr_request.state).to eq "Error"
expect(ocr_request.note).to include "PDF OCR job failed"
expect(File.exist?(out_path)).to be true
expect(sftp_session).to have_received(:upload!)
expect(ocr_request.pdf.attached?).to be false
end
end
Expand All @@ -47,7 +40,7 @@
let(:resource) { FactoryBot.create(:ocr_request) }

it "adds an error message to the ocr request resource" do
described_class.perform_now(resource: resource, out_path: out_path)
described_class.perform_now(resource: resource)
ocr_request = OcrRequest.all.first
expect(ocr_request.state).to eq "Error"
expect(ocr_request.note).to include "Resource has no attached PDF"
Expand Down