This repository has been archived by the owner on May 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_split_step.rb
90 lines (80 loc) · 3.17 KB
/
pdf_split_step.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# frozen_string_literal: true
module Derivative
  module Rodeo
    module Step
      ##
      # The :pdf_split derivative takes a single file and produces additional files from it.
      # Those files behave somewhat like originals, in that each has its own processing chain.
      class PdfSplitStep < BaseStep
        ##
        # @!group Class Attributes
        #
        # @!attribute [rw]
        # @return [Symbol]
        #
        # The name passed to `Utilities::PdfSplitter.for` in order to look up the splitter this
        # step uses.  Defaults to `:tiff`.
        #
        # @see #pdf_splitter
        # @see #generate
        class_attribute :pdf_splitter_name, default: :tiff

        ##
        # @!attribute [rw]
        # @return [Symbol, #to_sym]
        #
        # This step works on a parent chain whose directory is likely
        # `<work_identifier>/<file_set_filename>` (see
        # {Manifest::PreProcess::Identifier#directory_slugs}).  That directory holds a file named
        # `base_file_for_chain`, which is the original PDF.
        #
        # When the PDF is split, each page is written to the following path, relative to the
        # parent: `<first_spawn_step_name>/<index>/base_file_for_chain`
        #
        # The first_spawn_step_name is the first step of the split chain; it is responsible for
        # ensuring the split chain has the page image for later processing (e.g. :page_ocr).
        class_attribute :first_spawn_step_name, default: :page_image, instance_writer: false

        ##
        # In this case the base_file_for_chain likely represents the original PDF.
        self.prerequisites = %i[base_file_for_chain]

        # Evaluated once, while the class body is loaded, so the list captures the value that
        # first_spawn_step_name holds at that moment.
        # NOTE(review): reassigning first_spawn_step_name afterwards does not by itself change
        # this list -- confirm whether callers are expected to reassign spawns as well.
        self.spawns = [first_spawn_step_name, :hocr]
        # @!endgroup Class Attributes

        ##
        # The splitter looked up by {.pdf_splitter_name}, cached in an instance variable after
        # the first lookup.
        #
        # The splitter's call function should receive a string (in {#generate} it is given the
        # path of the base file for the chain) and return an Enumerable that yields each page's
        # path.
        #
        # @return [#call, Utilities::PdfSplitter::Base]
        def pdf_splitter
          @pdf_splitter ||= Utilities::PdfSplitter.for(pdf_splitter_name)
        end

        ##
        # @api private
        #
        # @note Provided as a convenience method for testing; assigning a splitter here bypasses
        #       the lookup performed by {#pdf_splitter}.
        attr_writer :pdf_splitter

        ##
        # Split the base file for this chain into pages and start a derived processing chain for
        # every page.  Each page lands under the configured first_spawn_step_name (`:page_image`
        # unless reconfigured) together with its zero-based position in the split.
        def generate
          page_paths = pdf_splitter.call(base_file_for_chain_path)
          page_paths.each_with_index do |page_path, page_index|
            process_page_split!(path: page_path, index: page_index)
          end
        end

        ##
        # Because the {PdfSplitStep} spawns many new processes (see #generate), there is no
        # single "derivative" per se, so a different kind of check applies: does the directory
        # that houses the split pages (named by the configured first_spawn_step_name) exist?
        #
        # NOTE(review): this method only returns what `storage.directory_exists?` returns and
        # does not raise here -- confirm that matches what callers of a bang method expect.
        #
        # @param storage [StorageAdapters::Base]
        # @return [Object] the result of `storage.directory_exists?` (presumably a boolean)
        def self.demand_path_for!(storage:)
          storage.directory_exists?(first_spawn_step_name)
        end

        private

        ##
        # Build the arena derived from this step's arena for a single split page and start
        # processing it.
        #
        # @param path [String] presumably the path to the page file yielded by the splitter
        # @param index [Integer] zero-based position of the page, as counted by #generate
        # @return [Object] whatever the derived arena's `start_processing!` returns
        def process_page_split!(path:, index:)
          Derivative::Rodeo::Arena.for_derived(
            parent_arena: arena,
            path_to_base_file_for_chain: path,
            first_spawn_step_name: first_spawn_step_name.to_sym,
            index: index,
            derivatives: spawns
          ).start_processing!
        end
      end
    end
  end
end