Skip to content

Commit

Permalink
Refactor ocr jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
eliotjordan committed Dec 18, 2024
1 parent 6737ab4 commit 7733e99
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ gem "mime-types"
gem "mini_magick"
gem "modernizr-rails"
# Pin because capistrano raises an error at >= 7.2
gem "net-ssh", "~> 7.1.0"
gem "net-sftp"
gem "net-ssh", "~> 7.1.0"
gem "normalize-rails"
gem "oai"
gem "omniauth", "1.9.2"
Expand Down
10 changes: 1 addition & 9 deletions app/jobs/create_ocr_request_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,7 @@ def perform(file_path:)
ocr_request = OcrRequest.new(filename: filename, state: "Enqueued")
ocr_request.save!
ocr_request.pdf.attach(io: File.open(file_path), filename: filename, content_type: "application/pdf")
out_path = File.join(ocr_out_dir, filename)
PdfOcrJob.perform_later(resource: ocr_request, out_path: out_path)
PdfOcrJob.perform_later(resource: ocr_request)
File.delete(file_path)
end

# Returns the directory configured under Figgy.config["ocr_out_path"],
# creating it (including any missing parent directories) when it is not
# already present on disk.
#
# @return [String] the configured OCR output directory
def ocr_out_dir
  Figgy.config["ocr_out_path"].tap do |dir|
    # Only touch the filesystem when the directory is actually missing.
    FileUtils.mkdir_p(dir) unless File.directory?(dir)
  end
end
end
17 changes: 14 additions & 3 deletions app/jobs/pdf_ocr_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@

class PdfOcrJob < ApplicationJob
queue_as :high
attr_reader :blob, :out_path, :resource
attr_reader :blob, :resource

def perform(resource:, out_path:)
def perform(resource:)
logger.info("PDF OCR job initiated for: #{resource.filename}")
@resource = resource
@out_path = out_path
@blob = resource.pdf # Required for ActiveStorage blob to tempfile method.
update_state(state: "Processing")
return unless pdf_attached?
Expand Down Expand Up @@ -40,4 +39,16 @@ def update_state(state:, message: nil)
resource.note = message if message
resource.save
end

# Builds the destination path for this job's output file by joining the
# OCR output directory with the resource's filename.
#
# @return [String] path inside the OCR output directory
def out_path
  directory = ocr_out_dir
  File.join(directory, resource.filename)
end

# Returns the OCR output directory read from Figgy.config["ocr_out_path"].
# On first use the directory is created (with parents) if it is missing;
# the path is then cached in @ocr_out_dir so later calls skip both the
# config lookup and the filesystem check.
#
# @return [String] the configured OCR output directory
def ocr_out_dir
  return @ocr_out_dir if @ocr_out_dir

  configured = Figgy.config["ocr_out_path"]
  FileUtils.mkdir_p(configured) unless File.directory?(configured)
  @ocr_out_dir = configured
end
end
8 changes: 4 additions & 4 deletions spec/jobs/pdf_ocr_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
RSpec.describe PdfOcrJob do
describe "#perform" do
let(:out_dir) { Figgy.config["ocr_out_path"] }
let(:out_path) { File.join(out_dir, "ocr-sample.pdf") }
let(:out_path) { File.join(out_dir, "sample.pdf") }
let(:resource) { FactoryBot.create(:ocr_request, file: fixture_path) }

before do
Expand All @@ -21,7 +21,7 @@
let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "sample.pdf") }

it "creates on OCRed PDF in an output directory and deletes the attached PDF" do
expect { described_class.perform_now(resource: resource, out_path: out_path) }
expect { described_class.perform_now(resource: resource) }
.to change { File.exist?(out_path) }
.from(false).to(true)
ocr_request = OcrRequest.all.first
Expand All @@ -34,7 +34,7 @@
let(:fixture_path) { Rails.root.join("spec", "fixtures", "files", "bad.pdf") }

it "saves error on the ocr request resource and copies original file to out path" do
described_class.perform_now(resource: resource, out_path: out_path)
described_class.perform_now(resource: resource)
ocr_request = OcrRequest.all.first
expect(ocr_request.state).to eq "Error"
expect(ocr_request.note).to include "PDF OCR job failed"
Expand All @@ -47,7 +47,7 @@
let(:resource) { FactoryBot.create(:ocr_request) }

it "adds an error message to the ocr request resource" do
described_class.perform_now(resource: resource, out_path: out_path)
described_class.perform_now(resource: resource)
ocr_request = OcrRequest.all.first
expect(ocr_request.state).to eq "Error"
expect(ocr_request.note).to include "Resource has no attached PDF"
Expand Down

0 comments on commit 7733e99

Please sign in to comment.