Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added server mode with tests #23

Merged
merged 2 commits into from
Dec 20, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 93 additions & 12 deletions lib/yomu.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,16 @@
require 'mime/types'
require 'json'

require 'socket'
require 'stringio'

class Yomu
GEMPATH = File.dirname(File.dirname(__FILE__))
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.6.jar')
DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port

@@server_port = nil
@@server_pid = nil

# Read text or metadata from a data buffer.
#
Expand All @@ -15,6 +22,21 @@ class Yomu
# metadata = Yomu.read :metadata, data

def self.read(type, data)
  # Use the running Tika server when one has been started; otherwise fall
  # back to spawning a one-off Tika process for this call.
  raw = if @@server_pid
          _server_read(type, data)
        else
          _client_read(type, data)
        end

  # Text and HTML are returned verbatim; metadata is decoded from JSON and
  # the mimetype is looked up from the decoded 'Content-Type' entry.
  case type
  when :text, :html
    raw
  when :metadata
    JSON.parse(raw)
  when :mimetype
    content_type = JSON.parse(raw)['Content-Type']
    MIME::Types[content_type].first
  end
end

def self._client_read(type, data)
switch = case type
when :text
'-t'
Expand All @@ -25,23 +47,35 @@ def self.read(type, data)
when :mimetype
'-m -j'
end
result = IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|

IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
io.write data
io.close_write
io.read
end
end

case type
when :text
result
when :html
result
when :metadata
JSON.parse(result)
when :mimetype
MIME::Types[JSON.parse(result)['Content-Type']].first

# Send +data+ to the Tika server listening on @@server_port and return the
# raw response as a String.
#
# The first argument (the requested type) is ignored here: nothing about the
# type is transmitted over the socket, only the document bytes.
#
# The socket is always closed, even when writing or reading raises.
def self._server_read(_, data)
  socket = TCPSocket.new('localhost', @@server_port)

  begin
    socket.write(data)

    # tell Tika that we're done sending data
    socket.shutdown(Socket::SHUT_WR)

    response = ''
    loop do
      chunk = socket.recv(65536)
      # Test for nil first so #empty? is never called on nil.
      break if chunk.nil? || chunk.empty?
      response << chunk
    end
    response
  ensure
    # Release the descriptor; one socket is opened per call.
    socket.close
  end
end

# Create a new instance of Yomu with a given document.
Expand Down Expand Up @@ -137,7 +171,6 @@ def creation_date
end
end


def path?
defined? @path
end
Expand Down Expand Up @@ -180,6 +213,54 @@ def data
@data
end

# Returns pid of Tika server, started as a new spawned process.
#
# type :html, :text, :metadata or :mimetype
# custom_port e.g. 9293
#
# Yomu.server(:text, 9294)
#
def self.server(type, custom_port=nil)
  # :metadata and :mimetype both ask Tika for JSON metadata output.
  switch = case type
           when :text then '-t'
           when :html then '-h'
           when :metadata, :mimetype then '-m -j'
           end

  @@server_port = custom_port || DEFAULT_SERVER_PORT

  # Build the command as an argument list rather than one interpolated
  # string, so a java or jar path containing whitespace or shell
  # metacharacters is passed through intact and no shell is involved.
  # An unknown type yields no switch at all.
  command = [
    java, '-Djava.awt.headless=true', '-jar', Yomu::JARPATH,
    '--server', '--port', @@server_port.to_s,
    *switch.to_s.split
  ]

  @@server_pid = Process.spawn(*command)
  sleep(2) # Give the server 2 seconds to spin up.
  @@server_pid
end

# Kills server started by Yomu.server
#
# Always run this when you're done, or else Tika might run until you kill it manually
# You might try putting your extraction in a begin..rescue...ensure...end block and
# putting this method in the ensure block.
#
# Yomu.server(:text)
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
# begin
# my_texts = reports.map{|report_path| Yomu.new(report_path).text }
# rescue
# ensure
# Yomu.kill_server!
# end
def self.kill_server!
  return unless @@server_pid

  begin
    Process.kill('INT', @@server_pid)
    # Reap the child in the background so it does not linger as a zombie,
    # without blocking the caller while the JVM shuts down.
    Process.detach(@@server_pid)
  rescue Errno::ESRCH
    # The process has already exited; there is nothing left to signal.
  ensure
    # Always forget the server, even if signalling failed, so later reads
    # do not try to use a server that is no longer there.
    @@server_pid = nil
    @@server_port = nil
  end

  nil
end

def self.java
  # Prefer the JVM under JAVA_HOME when that variable is set; otherwise rely
  # on whatever `java` resolves to on the PATH.
  home = ENV['JAVA_HOME']
  return 'java' unless home

  "#{home}/bin/java"
end
Expand Down
30 changes: 30 additions & 0 deletions spec/yomu_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,34 @@
expect( yomu.metadata['Content-Type'] ).to eql ["application/vnd.apple.pages", "application/vnd.apple.pages"]
end
end

# Specs for server mode: a Tika server is started via Yomu.server and
# always shut down via Yomu.kill_server! in an ensure block.
context 'working as server mode' do
# Starting the server must record a pid and a port and accept TCP
# connections; after kill_server! the port must refuse connections.
specify '#starts and kills server' do
begin
Yomu.server(:text)
expect(Yomu.class_variable_get(:@@server_pid)).not_to be_nil
expect(Yomu.class_variable_get(:@@server_port)).not_to be_nil

s = TCPSocket.new('localhost', Yomu.class_variable_get(:@@server_port))
expect(s).to be_a TCPSocket
s.close
ensure
# Capture the port before kill_server! resets it to nil.
port = Yomu.class_variable_get(:@@server_port)
Yomu.kill_server!
# Wait two seconds before probing the port again.
sleep 2
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
end
end

# With a text-mode server running, text extraction of the sample documents
# must include the expected sentence.
specify '#runs samples through server mode' do
begin
Yomu.server(:text)
expect(Yomu.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
expect(Yomu.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
expect(Yomu.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
ensure
Yomu.kill_server!
end
end
end
end