Skip to content

Commit

Permalink
Upload PDFs to Illiad SFTP server
Browse files Browse the repository at this point in the history
  • Loading branch information
eliotjordan committed Dec 18, 2024
1 parent 7733e99 commit 1ef0f79
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 43 deletions.
8 changes: 1 addition & 7 deletions app/controllers/ocr_requests_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def upload_file
authorize! :update, OcrRequest.new
@ocr_request = OcrRequest.new(ocr_request_params)
if @ocr_request.save
PdfOcrJob.perform_later(resource: @ocr_request, out_path: ocr_out_file)
PdfOcrJob.perform_later(resource: @ocr_request)
render status: :ok, json: { message: "uploaded" }
else
render status: :unprocessable_entity, json: @ocr_request.errors
Expand All @@ -30,12 +30,6 @@ def upload_file

private

# Builds the destination path for the OCR output of the current request.
#
# Reads the output directory from Figgy.config["ocr_out_path"], creates it
# (including parents) when it is not already a directory, and returns that
# directory joined with the request's filename.
def ocr_out_file
  target_dir = Figgy.config["ocr_out_path"]
  # Only touch the filesystem when the directory is missing.
  File.directory?(target_dir) || FileUtils.mkdir_p(target_dir)
  File.join(target_dir, @ocr_request.filename)
end

def ocr_request_params
{
filename: params["file"].original_filename,
Expand Down
41 changes: 25 additions & 16 deletions app/jobs/pdf_ocr_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,16 @@ def pdf_attached?

def run_pdf_ocr
blob.open do |file|
_stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, out_path.to_s)
return true if status.success?
update_state(state: "Error", message: "PDF OCR job failed: #{error_str}")

# Copy original file to destination without OCR
FileUtils.cp file.path, out_path.to_s
_stdout_str, error_str, status = Open3.capture3("ocrmypdf", "--force-ocr", "--rotate-pages", "--deskew", file.path, temporary_file.path.to_s)
if status.success?
transfer_file(temporary_file.path.to_s)
true
else
update_state(state: "Error", message: "PDF OCR job failed: #{error_str}")
transfer_file(file.path)
false
end
end

false
end

def update_state(state:, message: nil)
Expand All @@ -40,15 +41,23 @@ def update_state(state:, message: nil)
resource.save
end

def out_path
File.join(ocr_out_dir, resource.filename)
# Returns the Tempfile used as the OCR output target, creating it on first
# use and handing back the same instance on every later call.
def temporary_file
  return @temporary_file if @temporary_file

  @temporary_file = Tempfile.new
end

def ocr_out_dir
@ocr_out_dir ||= begin
path = Figgy.config["ocr_out_path"]
FileUtils.mkdir_p(path) unless File.directory?(path)
path
end
# Uploads a local file to the Illiad SFTP server.
#
# Connection settings (host, user, password, port) and the remote base path
# are read from Figgy.config. The file is written to
# <illiad_sftp_path>/pdf/<resource.filename> on the remote side.
#
# @param path [String] path of the local file to upload
# @return the value returned by the SFTP session's upload! call
# @raise whatever Net::SFTP.start or upload! raises; the error is propagated
#   unchanged after the session (if one was opened) has been closed
def transfer_file(path)
  host = Figgy.config["illiad_sftp_host"]
  user = Figgy.config["illiad_sftp_user"]
  pass = Figgy.config["illiad_sftp_pass"]
  port = Figgy.config["illiad_sftp_port"]
  out_path = File.join(Figgy.config["illiad_sftp_path"], "pdf", resource.filename)

  sftp = nil
  begin
    sftp = Net::SFTP.start(host, user, { password: pass, port: port })
    sftp.upload!(path, out_path)
  ensure
    # sftp is still nil when Net::SFTP.start itself raised. Calling the
    # cleanup methods on nil would raise NoMethodError and hide the real
    # connection/authentication error, so only close an opened session.
    if sftp
      sftp.close_channel
      sftp.session.close
    end
  end
end
end
10 changes: 5 additions & 5 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ defaults: &defaults
plausible_api_key: <%= ENV["PLAUSIBLE_API_KEY"] %>
cdl_in_path: <%= ENV["CDL_IN_PATH"] %>
ocr_in_path: <%= ENV["OCR_IN_PATH"] %>
ocr_out_path: <%= ENV["OCR_OUT_PATH"] %>
illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] %>
illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] %>
illiad_sftp_host: <%= ENV["ILLIAD_SFTP_HOST"] || "sftp.example.com" %>
illiad_sftp_port: <%= ENV["ILLIAD_SFTP_PORT"] || "2222" %>
illiad_sftp_user: <%= ENV["ILLIAD_SFTP_USER"] || "user" %>
illiad_sftp_pass: <%= ENV["ILLIAD_SFTP_PASSWORD"] || "password" %>
illiad_sftp_path: <%= ENV["ILLIAD_SFTP_BASE_PATH"] || "/illiad" %>
pyramidals_bucket: "iiif-image-staging"
pyramidals_region: "us-east-1"
aws_access_key_id: <%= ENV["FIGGY_AWS_ACCESS_KEY_ID"] %>
Expand Down Expand Up @@ -239,7 +241,6 @@ development:
<<: *defaults
cdl_in_path: <%= Rails.root.join("tmp", "cdl_in") %>
ocr_in_path: <%= Rails.root.join("tmp", "ocr_in") %>
ocr_out_path: <%= Rails.root.join("tmp", "ocr_out") %>
repository_path: <%= Rails.root.join("tmp", "more_files") %>
pyramidals_bucket: <%= ENV["FIGGY_PYRAMIDALS_BUCKET"] %>
cloud_geo_bucket: "figgy-geo-staging"
Expand All @@ -256,7 +257,6 @@ test:
plausible_api_key: "plausible_api_key"
cdl_in_path: <%= Rails.root.join("tmp", "test_cdl_in") %>
ocr_in_path: <%= Rails.root.join("tmp", "test_ocr_in#{ENV["TEST_ENV_NUMBER"]}") %>
ocr_out_path: <%= Rails.root.join("tmp", "test_ocr_out#{ENV["TEST_ENV_NUMBER"]}") %>
repository_path: <%= Rails.root.join("tmp", "test_files#{ENV["TEST_ENV_NUMBER"]}") %>
derivative_path: <%= Rails.root.join("tmp", "test_derivatives#{ENV["TEST_ENV_NUMBER"]}") %>
pyramidal_derivative_path: <%= Rails.root.join("tmp", "test_pyramidal_derivatives#{ENV["TEST_ENV_NUMBER"]}") %>
Expand Down
29 changes: 14 additions & 15 deletions spec/jobs/pdf_ocr_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,26 @@

RSpec.describe PdfOcrJob do
describe "#perform" do
let(:out_dir) { Figgy.config["ocr_out_path"] }
let(:out_path) { File.join(out_dir, "sample.pdf") }
let(:ssh_session) { instance_double(Net::SSH::Connection::Session) }
let(:sftp_session) { instance_double(Net::SFTP::Session) }
let(:resource) { FactoryBot.create(:ocr_request, file: fixture_path) }

before do
# Create tmp ocr out directory
FileUtils.mkdir_p(out_dir) unless File.directory?(out_dir)
end

after do
# Cleanup PDFs
File.delete(out_path) if File.exist?(out_path)
allow(Net::SFTP).to receive(:start).and_return(sftp_session)
allow(sftp_session).to receive(:upload!)
allow(sftp_session).to receive(:close_channel)
allow(sftp_session).to receive(:session).and_return(ssh_session)
allow(ssh_session).to receive(:close)
end

context "with a valid PDF" do
let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "sample.pdf") }

it "creates on OCRed PDF in an output directory and deletes the attached PDF" do
expect { described_class.perform_now(resource: resource) }
.to change { File.exist?(out_path) }
.from(false).to(true)
it "creates on OCRed PDF, uploads the file to the Illiad SFTP server, and deletes the attached PDF" do
described_class.perform_now(resource: resource)
expect(sftp_session).to have_received(:upload!)
expect(sftp_session).to have_received(:close_channel)
expect(ssh_session).to have_received(:close)
ocr_request = OcrRequest.all.first
expect(ocr_request.state).to eq "Complete"
expect(ocr_request.pdf.attached?).to be false
Expand All @@ -33,12 +32,12 @@
context "with a PDF that can't be OCRed" do
let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "bad.pdf") }

it "saves error on the ocr request resource and copies original file to out path" do
it "saves error on the ocr request resource and uploads the original file to the Illiad SFTP server" do
described_class.perform_now(resource: resource)
ocr_request = OcrRequest.all.first
expect(ocr_request.state).to eq "Error"
expect(ocr_request.note).to include "PDF OCR job failed"
expect(File.exist?(out_path)).to be true
expect(sftp_session).to have_received(:upload!)
expect(ocr_request.pdf.attached?).to be false
end
end
Expand Down

0 comments on commit 1ef0f79

Please sign in to comment.