From 85745f24fa5700efa5d6e1a06e0beff6deccae3c Mon Sep 17 00:00:00 2001
From: "Jeremy B. Merrill"
Date: Fri, 15 Aug 2014 17:11:31 -0400
Subject: [PATCH] adds server mode

---
Review notes (text between the "---" marker and the diff is not part of the
commit message):
* The line structure of this patch was rebuilt from a copy whose newlines had
  been collapsed. Indentation and whitespace-only lines follow the usual
  two-space Ruby layout and should be checked against the target file.
* _server_read now closes its socket in an ensure clause and tests for a nil
  chunk before calling empty? on it.
* kill_server! now clears the stored pid and port so Yomu.read stops routing
  to a server that has been killed.
* Hunk headers and the diffstat were recounted for these revisions; the
  "index" blob ids below are still those of the original commit.

 lib/yomu.rb | 112 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 100 insertions(+), 12 deletions(-)

diff --git a/lib/yomu.rb b/lib/yomu.rb
index 3559690..8353d06 100644
--- a/lib/yomu.rb
+++ b/lib/yomu.rb
@@ -4,9 +4,16 @@
 require 'mime/types'
 require 'json'
 
+require 'socket'
+require 'stringio'
+
 class Yomu
   GEMPATH = File.dirname(File.dirname(__FILE__))
   JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.5.jar')
+  DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
+
+  @@server_port = nil
+  @@server_pid = nil
 
   # Read text or metadata from a data buffer.
   #
@@ -15,6 +22,23 @@ class Yomu
   #   metadata = Yomu.read :metadata, data
 
   def self.read(type, data)
+    result = @@server_pid ? self._server_read(type, data) : self._client_read(type, data)
+
+    case type
+    when :text
+      result
+    when :html
+      result
+    when :metadata
+      JSON.parse(result)
+    when :mimetype
+      MIME::Types[JSON.parse(result)['Content-Type']].first
+    end
+  end
+
+  # Pipes +data+ through a one-shot Tika process started with the switch for
+  # +type+ and returns whatever Tika wrote to its standard output.
+  def self._client_read(type, data)
     switch = case type
     when :text
       '-t'
@@ -25,23 +49,40 @@ def self.read(type, data)
     when :mimetype
       '-m -j'
     end
-    
-    result = IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
+
+    IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
       io.write data
       io.close_write
       io.read
     end
+  end
 
-    case type
-    when :text
-      result
-    when :html
-      result
-    when :metadata
-      JSON.parse(result)
-    when :mimetype
-      MIME::Types[JSON.parse(result)['Content-Type']].first
+  # Sends +data+ to localhost on @@server_port and returns the raw reply.
+  # The first argument is ignored; the Tika switch is chosen in Yomu.server.
+  def self._server_read(_, data)
+    s = TCPSocket.new('localhost', @@server_port)
+    file = StringIO.new(data, 'r')
+
+    while 1
+      chunk = file.read(65536)
+      break unless chunk
+      s.write(chunk)
     end
+
+    # tell Tika that we're done sending data
+    s.shutdown(Socket::SHUT_WR)
+
+    resp = ''
+    while 1
+      chunk = s.recv(65536)
+      # end of stream is '' on older Rubies and nil on Ruby 3.3+, so test nil first
+      break if chunk.nil? || chunk.empty?
+      resp << chunk
+    end
+    resp
+  ensure
+    # release the descriptor even when the exchange raises; s is nil if connecting failed
+    s.close if s
   end
 
   # Create a new instance of Yomu with a given document.
@@ -137,7 +178,6 @@ def creation_date
     end
   end
 
-
   def path?
     defined? @path
   end
@@ -180,6 +220,54 @@ def data
     @data
   end
 
+  # Returns pid of Tika server, started as a new spawned process.
+  #
+  #  type :html, :text, :metadata or :mimetype
+  #  custom_port e.g. 9293
+  #
+  #  Yomu.server(:text, 9294)
+  #
+  def self.server(type, custom_port=nil)
+    switch = case type
+    when :text
+      '-t'
+    when :html
+      '-h'
+    when :metadata
+      '-m -j'
+    when :mimetype
+      '-m -j'
+    end
+
+    @@server_port = custom_port || DEFAULT_SERVER_PORT
+
+    @@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}")
+    sleep(2) # Give the server 2 seconds to spin up.
+    @@server_pid
+  end
+
+  # Kills server started by Yomu.server
+  #
+  #  Always run this when you're done, or else Tika might run until you kill it manually
+  #  You might try putting your extraction in a begin..rescue...ensure...end block and
+  #  putting this method in the ensure block.
+  #
+  #  Yomu.server(:text)
+  #  reports = ["report1.docx", "report2.doc", "report3.pdf"]
+  #  begin
+  #    my_texts = reports.map{|report_path| Yomu.new(report_path).text }
+  #  rescue
+  #  ensure
+  #    Yomu.kill_server!
+  #  end
+  def self.kill_server!
+    return unless @@server_pid
+    Process.kill('INT', @@server_pid)
+    # forget the server so Yomu.read goes back to one-shot Tika processes
+    @@server_pid = nil
+    @@server_port = nil
+  end
+
   def self.java
     ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
   end