Adding Base splitting strategy

Begin moving IIIF Print splitting strategies into the code base.
notch8 · Mar 27, 2023 · 5f1ca02 · 5f1ca02
1 parent 9751ce6
commit 5f1ca02
Show file tree

Hide file tree

Showing 8 changed files with 171 additions and 1 deletion.
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -2,6 +2,7 @@ PATH
   remote: .
   specs:
     space_stone-pdf_splitter (0.1.0)
+      activesupport (>= 5)
       mini_magick
 
 GEM

diff --git a/lib/space_stone/pdf_splitter.rb b/lib/space_stone/pdf_splitter.rb
@@ -2,6 +2,7 @@
 
 require_relative 'pdf_splitter/version'
 require 'space_stone/pdf_splitter/pdf_pages_summary'
+require 'space_stone/pdf_splitter/strategies'
 
 module SpaceStone
   module PdfSplitter

diff --git a/lib/space_stone/pdf_splitter/pdf_pages_summary.rb b/lib/space_stone/pdf_splitter/pdf_pages_summary.rb
@@ -10,11 +10,23 @@ module PdfSplitter
       :height, :pixels_per_inch, :color_description,
       :channels, :bits, keyword_init: true
     ) do
-      # return [Array<String, Integer, Integer>]
+      # @return [Array<String, Integer, Integer>]
       def color
         [color_description, channels, bits]
       end
       alias_method :ppi, :pixels_per_inch
+
+      # If the underlying extraction couldn't set the various properties, we likely have an invalid PDF.
+      # @return [Boolean] false when a required member is nil or the page count is zero.
+      def valid?
+        return false if color_description.nil?
+        return false if channels.nil?
+        return false if bits.nil?
+        return false if height.nil?
+        return false if page_count.to_i.zero?
+
+        true
+      end
     end
 
     # I want to ensure the struct is created first so that I don't have collisions on name space.

diff --git a/lib/space_stone/pdf_splitter/strategies.rb b/lib/space_stone/pdf_splitter/strategies.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module SpaceStone
+  module PdfSplitter
+    # Namespace for the strategies that split PDFs; the base strategy is required after this module.
+    module Strategies
+    end
+  end
+end
+
+require 'space_stone/pdf_splitter/strategies/base'
diff --git a/lib/space_stone/pdf_splitter/strategies/base.rb b/lib/space_stone/pdf_splitter/strategies/base.rb
@@ -0,0 +1,134 @@
+# frozen_string_literal: true
+
+require 'open3'
+require 'securerandom'
+require 'tmpdir'
+require 'active_support/core_ext/class/attribute'
+
+module SpaceStone
+  module PdfSplitter
+    module Strategies
+      ##
+      # @abstract Subclasses must implement {#gsdevice} and are expected to set {.image_extension}.
+      #
+      # The purpose of this class is to split the PDF into constituent image files by running
+      # Ghostscript (`gs`); `pdfinfo` is used to count the pages of the PDF.
+      #
+      # @see #each
+      class Base
+        # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
+        include Enumerable
+
+        # Captures the page count from the "Pages:" line that `pdfinfo` prints.
+        PAGE_COUNT_REGEXP = %r{^Pages: +(\d+)$}.freeze
+
+        # NOTE(review): the `default:` option needs ActiveSupport >= 5.2 -- confirm the gemspec floor.
+        # Extension of the generated image files; interpolated after the "." of each file name.
+        class_attribute :image_extension
+        # Resolution handed to `gs -r` when the PDF does not look like scanned media.
+        class_attribute :default_dpi, default: 400
+        # Should we perform compression logic on the images?
+        class_attribute :compression, default: nil
+        # What is the image quality we're using?
+        class_attribute :quality, default: nil
+
+        ##
+        # @param path [String] the path to the source PDF that we're processing.
+        # @param baseid [String] used for creating a unique identifier
+        # @param tmpdir [String] place to perform the "work" of splitting the PDF.
+        #
+        # @param pdf_pages_summary [SpaceStone::PdfSplitter::PdfPagesSummary] by default we'll
+        #        extract this from the given path, but for testing purposes, you might want to
+        #        provide a specific summary.
+        def initialize(path,
+                       baseid: SecureRandom.uuid,
+                       tmpdir: Dir.mktmpdir,
+                       pdf_pages_summary: PdfPagesSummary.extract(path: path))
+          @baseid = baseid
+          @pdfpath = path
+          @pdf_pages_summary = pdf_pages_summary
+          @tmpdir = tmpdir
+        end
+
+        ##
+        # @api public
+        #
+        # @yieldparam [String] the path to the page's image file.
+        # @return [Enumerator] when no block is given.
+        def each(&block)
+          return enum_for(:each) unless block
+
+          entries.each(&block)
+        end
+
+        # @api private
+        def invalid_pdf?
+          !pdf_pages_summary.valid?
+        end
+
+        attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
+        private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
+
+        private
+
+        # @return [Array<String>] memoized paths of the per-page image files.
+        def entries
+          # Kernel#Array is used because this file does not require Array.wrap's core extension.
+          @entries ||= Array(gsconvert)
+        end
+
+        # Runs Ghostscript and returns one expected image path per "Page " line on its stdout.
+        def gsconvert
+          # capture3 drains stdout and stderr together, so a chatty `gs` cannot block on a full pipe.
+          stdout, _stderr, _status = Open3.capture3(*gs_command)
+          page_total = stdout.each_line.count { |line| line.start_with?('Page ') }
+          (1..page_total).map { |number| File.join(tmpdir, "#{baseid}-page#{number}.#{image_extension}") }
+        end
+
+        # Builds the argument list for `gs`. Passing separate arguments (rather than one command
+        # string) bypasses the shell, so spaces or metacharacters in paths are taken literally.
+        def gs_command
+          # NOTE: you must call gsdevice before compression, as compression is
+          # updated during the gsdevice call.
+          command = ['gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}", '-dTextAlphaBits=4']
+          command << "-sCompression=#{compression}" if compression?
+          command << "-dJPEGQ=#{quality}" if quality?
+          command << "-sOutputFile=#{File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")}"
+          command << "-r#{ppi}" << '-f' << pdfpath.to_s
+        end
+
+        # @return [String] the value for `gs -sDEVICE`; subclasses must override this.
+        def gsdevice
+          raise NotImplementedError
+        end
+
+        # @return [Integer] page count reported by `pdfinfo`; 0 when no "Pages:" line is found.
+        def pagecount
+          return @pagecount if defined? @pagecount
+
+          stdout, _stderr, _status = Open3.capture3('pdfinfo', pdfpath.to_s)
+          match = PAGE_COUNT_REGEXP.match(stdout)
+          @pagecount = match ? match[1].to_i : 0
+        end
+
+        def ppi
+          # For scanned media, defer to detected image PPI; otherwise use the default (400 dpi).
+          looks_scanned? ? pdf_pages_summary.ppi : default_dpi
+        end
+
+        def looks_scanned?
+          width = pdf_pages_summary.width
+          height = pdf_pages_summary.height
+          return false if width.nil? || height.nil?
+
+          # single 10mp+ image per page?
+          single_image_per_page? && width * height > 1024 * 1024 * 10
+        end
+
+        def single_image_per_page?
+          pdf_pages_summary.page_count == pagecount
+        end
+      end
+    end
+  end
+end
diff --git a/space_stone-pdf_splitter.gemspec b/space_stone-pdf_splitter.gemspec
@@ -34,6 +34,7 @@ Gem::Specification.new do |spec|
   # guide at: https://bundler.io/guides/creating_gem.html
 
   spec.add_dependency 'mini_magick'
+  spec.add_dependency 'activesupport', ">= 5"
   spec.add_development_dependency 'bixby'
   spec.add_development_dependency 'rspec'
 end
diff --git a/spec/space_stone/pdf_splitter/pdf_pages_summary_spec.rb b/spec/space_stone/pdf_splitter/pdf_pages_summary_spec.rb
@@ -20,6 +20,7 @@
                         pixels_per_inch: 4, color_description: 'rgb', channels: 5, bits: 6)
   end
 
+  it { is_expected.to respond_to(:valid?) }
   it { is_expected.to respond_to(:path) }
   it { is_expected.to respond_to(:page_count) }
   it { is_expected.to respond_to(:width) }

diff --git a/spec/space_stone/pdf_splitter/strategies/base_spec.rb b/spec/space_stone/pdf_splitter/strategies/base_spec.rb
@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+
+RSpec.describe SpaceStone::PdfSplitter::Strategies::Base do
+  subject { described_class.new(__FILE__, pdf_pages_summary: pdf_pages_summary) }
+  let(:pdf_pages_summary) { double(SpaceStone::PdfSplitter::PdfPagesSummary) }
+
+  # Because the described class is an abstract class, we want to verify its public interface.
+  it { is_expected.to be_a(Enumerable) }
+end