This repository has been archived by the owner on May 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hocr_step.rb
67 lines (58 loc) · 2.38 KB
/
hocr_step.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# frozen_string_literal: true
module Derivative
module Rodeo
module Step
##
# Responsible for finding or creating a hocr file (or configured :output_suffix) using
# tesseract.
#
# @see http://tesseract-ocr.github.io
#
# From `tesseract -h`
#
# Usage:
# tesseract --help | --help-extra | --version
# tesseract --list-langs
# tesseract imagename outputbase [options...] [configfile...]
class HocrStep < BaseStep
  self.prerequisites = [:monochrome]

  ##
  # @!group Class Attributes

  ##
  # @!attribute [rw]
  #   Environment variable assignments placed in front of the tesseract command, separated
  #   from it by a single space; default `nil` (nothing is prepended).
  #
  #   @example
  #     Derivative::Rodeo::Step::HocrStep.command_environment_variables = "OMP_THREAD_LIMIT=1"
  class_attribute :command_environment_variables, default: nil

  ##
  # @!attribute [rw]
  #   Extra options inserted into the tesseract command after the output base and before
  #   the {.output_suffix}; default `nil` (nothing is inserted).
  #
  #   @note The attribute name is spelled `additional_tessearct_options`.
  class_attribute :additional_tessearct_options, default: nil

  ##
  # @!attribute [rw]
  #   Passed as the final argument of the tesseract command (the `[configfile...]` position
  #   of `tesseract imagename outputbase [options...] [configfile...]`) and also used as the
  #   extension of the file this step registers with the arena; default `:hocr`.
  class_attribute :output_suffix, default: :hocr

  # @!endgroup

  ##
  # Run tesseract against the local :monochrome derivative and register the resulting file
  # with the arena as this step's derivative.
  #
  # @raise [Exceptions::DerivativeNotFoundError] when we don't have a :monochrome {Step} or
  #        we failed to generate the :hocr file.
  def generate
    source_path = arena.local_demand_path_for!(derivative: :monochrome)

    # Assumption carried by this step: when the arena hands back a local path, the process
    # may write into that path's directory. Tesseract is given a base name
    # (e.g. base-hocr) and writes "base-hocr.hocr" alongside it.
    #
    # TODO: Going to need to consider how we use the path_for_shell_commands here.
    output_base = arena.local_path(derivative: to_sym)

    # Each optional fragment carries its own separating space so that nothing extra is
    # emitted when the corresponding class attribute is blank.
    env_fragment = command_environment_variables.present? ? command_environment_variables + " " : ""
    options_fragment = additional_tessearct_options.present? ? " #{additional_tessearct_options}" : ""

    local_run_command!("#{env_fragment}tesseract #{source_path} #{output_base}#{options_fragment} #{output_suffix}")

    arena.local_assign!(derivative: to_sym, path: "#{output_base}.#{output_suffix}")
  end
end
end
end
end