diff --git a/lib/yomu.rb b/lib/yomu.rb
index cdc7c2d..24807de 100644
--- a/lib/yomu.rb
+++ b/lib/yomu.rb
@@ -4,9 +4,16 @@
 require 'mime/types'
 require 'json'
 
+require 'socket'
+require 'stringio'
+
 class Yomu
   GEMPATH = File.dirname(File.dirname(__FILE__))
   JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.6.jar')
+  DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
+
+  @@server_port = nil
+  @@server_pid = nil
 
   # Read text or metadata from a data buffer.
   #
@@ -15,6 +22,21 @@ class Yomu
   #   metadata = Yomu.read :metadata, data
 
   def self.read(type, data)
+    result = @@server_pid ? self._server_read(type, data) : self._client_read(type, data)
+
+    case type
+    when :text
+      result
+    when :html
+      result
+    when :metadata
+      JSON.parse(result)
+    when :mimetype
+      MIME::Types[JSON.parse(result)['Content-Type']].first
+    end
+  end
+
+  def self._client_read(type, data)
     switch = case type
     when :text
       '-t'
@@ -25,23 +47,39 @@ def self.read(type, data)
     when :mimetype
       '-m -j'
     end
-    
-    result = IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
+
+    IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
       io.write data
       io.close_write
       io.read
     end
+  end
 
-    case type
-    when :text
-      result
-    when :html
-      result
-    when :metadata
-      JSON.parse(result)
-    when :mimetype
-      MIME::Types[JSON.parse(result)['Content-Type']].first
+  # Sends +data+ to the Tika server started by Yomu.server and returns the
+  # raw response as a String.
+  #
+  # The first argument (the requested type) is ignored here: the output
+  # format is fixed by the switch given to Tika when Yomu.server spawned it.
+  def self._server_read(_, data)
+    # Block form closes the socket even when a write or read raises.
+    TCPSocket.open('localhost', @@server_port) do |s|
+      file = StringIO.new(data, 'r')
+      while (chunk = file.read(65536))
+        s.write(chunk)
+      end
+
+      # tell Tika that we're done sending data
+      s.shutdown(Socket::SHUT_WR)
+
+      resp = ''
+      loop do
+        chunk = s.recv(65536)
+        # nil must be tested first; nil.empty? would raise NoMethodError
+        break if chunk.nil? || chunk.empty?
+        resp << chunk
+      end
+      resp
     end
   end
 
   # Create a new instance of Yomu with a given document.
@@ -137,7 +175,6 @@ def creation_date
     end
   end
 
-
   def path?
     defined? @path
   end
@@ -180,6 +217,64 @@ def data
     @data
   end
 
+  # Returns pid of Tika server, started as a new spawned process.
+  #
+  #  type         :html, :text, :metadata or :mimetype
+  #  custom_port  e.g. 9293
+  #
+  #   Yomu.server(:text, 9294)
+  #
+  def self.server(type, custom_port=nil)
+    switch = case type
+    when :text
+      '-t'
+    when :html
+      '-h'
+    when :metadata
+      '-m -j'
+    when :mimetype
+      '-m -j'
+    end
+
+    @@server_port = custom_port || DEFAULT_SERVER_PORT
+
+    @@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}")
+    sleep(2) # Give the server 2 seconds to spin up.
+    @@server_pid
+  end
+
+  # Kills server started by Yomu.server
+  #
+  #  Always run this when you're done, or else Tika might run until you kill it manually
+  #  You might try putting your extraction in a begin..rescue...ensure...end block and
+  #  putting this method in the ensure block.
+  #
+  #   Yomu.server(:text)
+  #   reports = ["report1.docx", "report2.doc", "report3.pdf"]
+  #   begin
+  #     my_texts = reports.map{|report_path| Yomu.new(report_path).text }
+  #   rescue
+  #   ensure
+  #     Yomu.kill_server!
+  #   end
+  #
+  # Does nothing when no server is running. The recorded pid and port are
+  # cleared even if the process has already exited.
+  def self.kill_server!
+    return unless @@server_pid
+
+    Process.kill('INT', @@server_pid)
+    # Reap the child in the background so it does not linger as a zombie.
+    Process.detach(@@server_pid)
+    nil
+  rescue Errno::ESRCH
+    # The server process is already gone; only the bookkeeping is left.
+    nil
+  ensure
+    @@server_pid = nil
+    @@server_port = nil
+  end
+
   def self.java
     ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
   end
diff --git a/spec/yomu_spec.rb b/spec/yomu_spec.rb
index c988042..4891392 100644
--- a/spec/yomu_spec.rb
+++ b/spec/yomu_spec.rb
@@ -149,4 +149,34 @@
       expect( yomu.metadata['Content-Type'] ).to eql ["application/vnd.apple.pages", "application/vnd.apple.pages"]
     end
   end
+
+  context 'working as server mode' do
+    specify '#starts and kills server' do
+      begin
+        Yomu.server(:text)
+        expect(Yomu.class_variable_get(:@@server_pid)).not_to be_nil
+        expect(Yomu.class_variable_get(:@@server_port)).not_to be_nil
+
+        s = TCPSocket.new('localhost', Yomu.class_variable_get(:@@server_port))
+        expect(s).to be_a TCPSocket
+        s.close
+      ensure
+        port = Yomu.class_variable_get(:@@server_port)
+        Yomu.kill_server!
+        sleep 2
+        expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
+      end
+    end
+
+    specify '#runs samples through server mode' do
+      begin
+        Yomu.server(:text)
+        expect(Yomu.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
+        expect(Yomu.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
+        expect(Yomu.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
+      ensure
+        Yomu.kill_server!
+      end
+    end
+  end
 end