This repository has been archived by the owner on May 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Begin moving IIIF Print splitting strategies into the code base.
- Loading branch information
Showing
8 changed files
with
171 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ PATH | |
remote: . | ||
specs: | ||
space_stone-pdf_splitter (0.1.0) | ||
activesupport (>= 5) | ||
mini_magick | ||
|
||
GEM | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# frozen_string_literal: true | ||
|
||
module SpaceStone | ||
module PdfSplitter | ||
# Namespace for declaring strategies for splitting PDFs. | ||
# | ||
# The module is declared empty here so the namespace exists before the strategy | ||
# files that reopen it are loaded. | ||
module Strategies | ||
end | ||
end | ||
end | ||
|
||
require 'space_stone/pdf_splitter/strategies/base' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'open3' | ||
require 'securerandom' | ||
require 'tmpdir' | ||
require 'active_support/core_ext/class/attribute' | ||
|
||
module SpaceStone | ||
module PdfSplitter | ||
module Strategies | ||
## | ||
# @abstract | ||
# | ||
# The purpose of this class is to split the PDF into constituent image files. | ||
# | ||
# @see #each | ||
class Base | ||
class_attribute :image_extension | ||
class_attribute :default_dpi, default: 400 | ||
# Should we perform compression logic on the images? | ||
class_attribute :compression, default: nil | ||
# What is the image quality we're using? | ||
class_attribute :quality, default: nil | ||
|
||
## | ||
# @param path [String] the path to the source PDF that we're processing. | ||
# @param baseid [String] used for creating a unique identifier; it prefixes the name of | ||
# every page image this strategy generates. Defaults to a random UUID. | ||
# @param tmpdir [String] place to perform the "work" of splitting the PDF. When omitted, | ||
# a new directory is created via Dir.mktmpdir for each instance. | ||
# NOTE(review): nothing in this class removes that directory; cleanup appears to be | ||
# left to the caller -- confirm. | ||
# | ||
# @param pdf_pages_summary [SpaceStone::PdfSplitter::PdfPagesSummary] by default we'll | ||
# extract this from the given path, but for testing purposes, you might want to | ||
# provide a specific summary. | ||
def initialize(path, baseid: SecureRandom.uuid, tmpdir: Dir.mktmpdir, pdf_pages_summary: PdfPagesSummary.extract(path: path)) | ||
@baseid = baseid | ||
@pdfpath = path | ||
@pdf_pages_summary = pdf_pages_summary | ||
@tmpdir = tmpdir | ||
end | ||
|
||
# In creating {#each} we get many of the methods of array operation (e.g. #to_a). | ||
include Enumerable | ||
|
||
##
# @api public
#
# Yields the path of each page image produced by splitting the PDF, in page order.
# The image format depends on the strategy's image_extension.
#
# @yieldparam entry [String] the path to one page's image file.
# @return [Enumerator] when called without a block.
# @return [Array<String>] the page image paths, when called with a block.
def each(&block)
  # Without this guard a block-less call would hit `yield` and raise LocalJumpError;
  # returning an Enumerator follows the usual Enumerable convention.
  return enum_for(:each) unless block

  entries.each(&block)
end
|
||
# @api private | ||
# | ||
# @return [Boolean] true when the pages summary reports itself as not valid. | ||
def invalid_pdf? | ||
!pdf_pages_summary.valid? | ||
end | ||
|
||
attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath | ||
private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath | ||
|
||
private | ||
|
||
# Memoized list of page image paths, one entry per page; the conversion runs at most
# once per instance.
#
# Kernel#Array does the coercion instead of ActiveSupport's Array.wrap: this file only
# requires `active_support/core_ext/class/attribute`, so Array.wrap is not guaranteed
# to be loaded.
#
# @return [Array<String>]
def entries
  @entries ||= Array(gsconvert)
end
|
||
# rubocop:disable Metrics/MethodLength | ||
# Runs Ghostscript (`gs`) to render every page of the PDF to its own image file in
# tmpdir, then returns the paths of the files Ghostscript was asked to write.
#
# The command is passed to Open3 as an argument list rather than one interpolated
# string, so no shell parses it: a PDF path or tmpdir containing spaces or shell
# metacharacters is handed to `gs` verbatim.
#
# The exit status is not inspected; a failed run simply yields fewer (or no) paths.
#
# @return [Array<String>] one path per "Page N" line printed on Ghostscript's stdout.
def gsconvert
  output_base = File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
  # NOTE: you must call gsdevice before compression, as compression is
  # updated during the gsdevice call.
  cmd = ['gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}", '-dTextAlphaBits=4']
  cmd << "-sCompression=#{compression}" if compression?
  cmd << "-dJPEGQ=#{quality}" if quality?
  cmd.push("-sOutputFile=#{output_base}", "-r#{ppi}", '-f', pdfpath.to_s)

  # capture3 drains stdout and stderr together, so a large amount of stderr output
  # cannot fill its pipe and stall the child process.
  stdout, _stderr, _status = Open3.capture3(*cmd)

  # Ghostscript prints one "Page N" line per rendered page; the %d in the output
  # template is filled with the same 1-based counter.
  page_total = stdout.each_line.count { |line| line.start_with?('Page ') }
  (1..page_total).map do |page_number|
    File.join(tmpdir, "#{baseid}-page#{page_number}.#{image_extension}")
  end
end
# rubocop:enable Metrics/MethodLength | ||
|
||
# @abstract Concrete strategies override this to name the Ghostscript output device;
#   the returned value is interpolated into the `-sDEVICE=` switch.
#
# @return [String]
# @raise [NotImplementedError] always, in this base class.
def gsdevice
  # Name the offending class so a missing override is easy to track down.
  raise NotImplementedError, "#{self.class}#gsdevice must be implemented by the concrete strategy"
end
|
||
# Matches the "Pages:   N" line in `pdfinfo` output. The ^/$ anchors are deliberately
# per-line, because that line sits in the middle of multi-line output.
PAGE_COUNT_REGEXP = %r{^Pages: +(\d+)$}.freeze

# Number of pages in the PDF as reported by the external `pdfinfo` command. The value
# is memoized after the first successful lookup.
#
# `pdfinfo` is invoked with an argument list (no shell), so the path is passed verbatim
# even when it contains spaces or shell metacharacters.
#
# @return [Integer]
# @raise [RuntimeError] when the output of `pdfinfo` has no "Pages:" line (for example
#   because the file is not a readable PDF).
def pagecount
  return @pagecount if defined? @pagecount

  stdout, _stderr, _status = Open3.capture3('pdfinfo', pdfpath.to_s)
  match = PAGE_COUNT_REGEXP.match(stdout)
  # Fail with an explicit message instead of a NoMethodError on nil.
  raise "Unable to determine page count for #{pdfpath}: pdfinfo reported no \"Pages:\" line" unless match

  @pagecount = match[1].to_i
end
|
||
# Resolution handed to Ghostscript's `-r` switch when rendering pages. | ||
# | ||
# @return the summary's detected PPI when the PDF looks scanned, otherwise default_dpi. | ||
def ppi | ||
if looks_scanned? | ||
# For scanned media, defer to detected image PPI: | ||
pdf_pages_summary.ppi | ||
else | ||
# default_dpi (400 unless the class attribute is overridden) for something that | ||
# does not look like scanned media: | ||
default_dpi | ||
end | ||
end | ||
|
||
# Heuristic: treat the PDF as scanned media when the summary's image dimensions exceed
# 10 * 1024 * 1024 pixels and there is a single image per page.
#
# @return [Boolean]
def looks_scanned?
  max_image_px = pdf_pages_summary.width * pdf_pages_summary.height
  # single 10mp+ image per page?
  # The pixel test comes first because it is plain arithmetic, whereas
  # single_image_per_page? relies on pagecount, which runs the external pdfinfo
  # command; small-image PDFs therefore never pay for the subprocess.
  max_image_px > 1024 * 1024 * 10 && single_image_per_page?
end
|
||
# Compares the summary's page_count with the page count parsed from pdfinfo output. | ||
# NOTE(review): presumably the summary's page_count reflects pages that carry an | ||
# image -- confirm against PdfPagesSummary. | ||
# | ||
# @return [Boolean] | ||
def single_image_per_page? | ||
pdf_pages_summary.page_count == pagecount | ||
end | ||
end | ||
end | ||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# frozen_string_literal: true | ||
|
||
RSpec.describe SpaceStone::PdfSplitter::Strategies::Base do | ||
# __FILE__ merely stands in for a path; supplying a double for the summary means the | ||
# default PdfPagesSummary.extract(path: path) argument is never evaluated. | ||
subject { described_class.new(__FILE__, pdf_pages_summary: pdf_pages_summary) } | ||
let(:pdf_pages_summary) { double(SpaceStone::PdfSplitter::PdfPagesSummary) } | ||
|
||
# Because the described class is an abstract class, we want to verify its public interface. | ||
it { is_expected.to be_a(Enumerable) } | ||
end |