Skip to content
This repository has been archived by the owner on May 3, 2023. It is now read-only.

Commit

Permalink
Adding Base splitting strategy
Browse files Browse the repository at this point in the history
Begin moving IIIF Print splitting strategies into the code base.
  • Loading branch information
jeremyf committed Mar 27, 2023
1 parent 9751ce6 commit 5f1ca02
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 1 deletion.
1 change: 1 addition & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ PATH
remote: .
specs:
space_stone-pdf_splitter (0.1.0)
activesupport (>= 5)
mini_magick

GEM
Expand Down
1 change: 1 addition & 0 deletions lib/space_stone/pdf_splitter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require_relative 'pdf_splitter/version'
require 'space_stone/pdf_splitter/pdf_pages_summary'
require 'space_stone/pdf_splitter/strategies'

module SpaceStone
module PdfSplitter
Expand Down
14 changes: 13 additions & 1 deletion lib/space_stone/pdf_splitter/pdf_pages_summary.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,23 @@ module PdfSplitter
:height, :pixels_per_inch, :color_description,
:channels, :bits, keyword_init: true
) do
# return [Array<String, Integer, Integer>]
# @return [Array<String, Integer, Integer>]
def color
  # Collect the three color-related members in a fixed order:
  # description, channel count, then bit depth.
  %i[color_description channels bits].map { |member| public_send(member) }
end
alias_method :ppi, :pixels_per_inch

# If the underlying extraction couldn't set the various properties, we likely have an
# invalid PDF.
#
# @return [Boolean] false when the color description, channels, bits, or height is nil,
#   or when the page count is nil or zero; true otherwise.
def valid?
  # This method is defined on the summary struct itself, so its members are read
  # directly. (The visible struct members do not include a `pdf_pages_summary` accessor;
  # going through one raised NameError.)
  return false if [color_description, channels, bits, height].any?(&:nil?)
  # `to_i` turns a nil page count into 0, so "unknown" and "zero pages" are both invalid.
  return false if page_count.to_i.zero?

  true
end
end

# I want to ensure the struct is created first so that I don't have collisions on name space.
Expand Down
11 changes: 11 additions & 0 deletions lib/space_stone/pdf_splitter/strategies.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# frozen_string_literal: true

module SpaceStone
module PdfSplitter
# Namespace for declaring strategies for splitting PDFs.
#
# The module body is intentionally empty: it only establishes the constant so that
# strategy classes can be defined beneath it.
module Strategies
end
end
end

require 'space_stone/pdf_splitter/strategies/base'
134 changes: 134 additions & 0 deletions lib/space_stone/pdf_splitter/strategies/base.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# frozen_string_literal: true

require 'open3'
require 'securerandom'
require 'tmpdir'
require 'active_support/core_ext/class/attribute'

module SpaceStone
module PdfSplitter
module Strategies
##
# @abstract
#
# The purpose of this class is to split the PDF into constituent image files.
#
# @see #each
class Base
class_attribute :image_extension
class_attribute :default_dpi, default: 400
# Should we perform compression logic on the images?
class_attribute :compression, default: nil
# What is the image quality we're using?
class_attribute :quality, default: nil

##
# Build a splitter for a single source PDF.
#
# @param path [String] location of the source PDF to be processed.
# @param baseid [String] identifier used when naming things uniquely; a random UUID
#   when not given.
# @param tmpdir [String] directory in which the splitting "work" happens; a freshly
#   made temporary directory when not given.
# @param pdf_pages_summary [SpaceStone::PdfSplitter::PdfPagesSummary] extracted from
#   the given path by default; tests may want to hand in a specific summary instead.
def initialize(
  path,
  baseid: SecureRandom.uuid,
  tmpdir: Dir.mktmpdir,
  pdf_pages_summary: PdfPagesSummary.extract(path: path)
)
  @pdfpath = path
  @tmpdir = tmpdir
  @baseid = baseid
  @pdf_pages_summary = pdf_pages_summary
end

# In creating {#each} we get many of the methods of array operation (e.g. #to_a).
include Enumerable

##
# @api public
#
# Iterate over the path of every page image produced by splitting the PDF.
#
# @yieldparam entry [String] the path to one page's image file.
# @return [Enumerator] when called without a block.
def each(&block)
  # Without a block there is nothing to yield to; hand back an Enumerator rather than
  # raising LocalJumpError.
  return enum_for(:each) unless block

  entries.each(&block)
end

# @api private
#
# @return [Boolean] true when the pages summary does not report itself as valid.
def invalid_pdf?
  return false if pdf_pages_summary.valid?

  true
end

attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath

private

# The image file paths, one entry per page. The conversion runs on first use and the
# result is memoized.
#
# @return [Array<String>]
def entries
  # Kernel#Array stands in for `Array.wrap`: that is an ActiveSupport core extension
  # which the requires at the top of this file do not load, and #gsconvert already
  # returns an Array.
  @entries ||= Array(gsconvert)
end

# rubocop:disable Metrics/MethodLength
##
# Run Ghostscript (`gs`) over the PDF, writing one image file per page into the working
# directory.
#
# The command is passed to Open3 as an argument vector rather than a single string, so
# no shell is involved and paths containing spaces or shell metacharacters reach `gs`
# intact.
#
# @return [Array<String>] one path per "Page " line that Ghostscript printed on stdout,
#   numbered from 1. The exit status is not inspected.
def gsconvert
  output_base = File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
  # NOTE: you must call gsdevice before compression, as compression is
  # updated during the gsdevice call.
  cmd = ['gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}", '-dTextAlphaBits=4']
  cmd << "-sCompression=#{compression}" if compression?
  cmd << "-dJPEGQ=#{quality}" if quality?
  cmd.push("-sOutputFile=#{output_base}", "-r#{ppi}", '-f', pdfpath.to_s)

  # capture3 reads stdout and stderr together, so the child cannot stall on an
  # unread stderr pipe.
  stdout, _stderr, _status = Open3.capture3(*cmd)

  # Ghostscript announces each processed page with a line starting "Page "; the %d in
  # the output file name is filled with the matching 1-based page number.
  page_total = stdout.each_line.count { |line| line.start_with?('Page ') }
  (1..page_total).map do |page_number|
    File.join(tmpdir, "#{baseid}-page#{page_number}.#{image_extension}")
  end
end
# rubocop:enable Metrics/MethodLength

##
# @abstract Concrete strategies supply the value interpolated into Ghostscript's
#   `-sDEVICE=` option by #gsconvert.
#
# @raise [NotImplementedError] always, in this base class.
def gsdevice
  # Name the class and method so a missing override is easy to spot in a backtrace.
  raise NotImplementedError, "#{self.class}#gsdevice must be implemented by a concrete strategy"
end

# Matches the "Pages:" line of `pdfinfo` output; the first capture is the page count.
PAGE_COUNT_REGEXP = %r{^Pages: +(\d+)$}.freeze

# Ask `pdfinfo` how many pages the PDF has. The answer is memoized.
#
# @return [Integer] the reported page count, or 0 when the output contains no
#   "Pages:" line (for example when `pdfinfo` could not read the file).
def pagecount
  return @pagecount if defined? @pagecount

  # Argument-vector form: no shell is involved, so a path with spaces or shell
  # metacharacters is passed through intact. capture3 also drains stderr.
  stdout, _stderr, _status = Open3.capture3('pdfinfo', pdfpath.to_s)
  match = PAGE_COUNT_REGEXP.match(stdout)
  # Guard against a missing "Pages:" line instead of calling #[] on nil.
  @pagecount = match ? match[1].to_i : 0
end

# Resolution passed to Ghostscript's -r option.
def ppi
  # Scanned media: defer to the PPI detected in the PDF's images.
  return pdf_pages_summary.ppi if looks_scanned?

  # Anything that does not look scanned gets the class-level default (400 unless
  # overridden).
  default_dpi
end

# Heuristic: one image per page, and that image is larger than ten binary megapixels.
def looks_scanned?
  summary = pdf_pages_summary
  pixel_area = summary.width * summary.height
  ten_megapixels = 10 * 1024 * 1024
  single_image_per_page? && pixel_area > ten_megapixels
end

# True when the summary's page_count equals the page count reported by #pagecount.
def single_image_per_page?
  summarized_count = pdf_pages_summary.page_count
  summarized_count == pagecount
end
end
end
end
end
1 change: 1 addition & 0 deletions space_stone-pdf_splitter.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Gem::Specification.new do |spec|
# guide at: https://bundler.io/guides/creating_gem.html

spec.add_dependency 'mini_magick'
spec.add_dependency 'activesupport', ">= 5"
spec.add_development_dependency 'bixby'
spec.add_development_dependency 'rspec'
end
1 change: 1 addition & 0 deletions spec/space_stone/pdf_splitter/pdf_pages_summary_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
pixels_per_inch: 4, color_description: 'rgb', channels: 5, bits: 6)
end

it { is_expected.to respond_to(:valid?) }
it { is_expected.to respond_to(:path) }
it { is_expected.to respond_to(:page_count) }
it { is_expected.to respond_to(:width) }
Expand Down
9 changes: 9 additions & 0 deletions spec/space_stone/pdf_splitter/strategies/base_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# frozen_string_literal: true

RSpec.describe SpaceStone::PdfSplitter::Strategies::Base do
# A summary double is passed in explicitly, so the initializer's default
# (extracting a summary from the given path) is not evaluated.
subject { described_class.new(__FILE__, pdf_pages_summary: pdf_pages_summary) }
let(:pdf_pages_summary) { double(SpaceStone::PdfSplitter::PdfPagesSummary) }

# Because the described class is an abstract class, we want to verify its public interface.
it { is_expected.to be_a(Enumerable) }
end

0 comments on commit 5f1ca02

Please sign in to comment.