From 98ac387fedc25b1cbfbbb214fda8d716b3920ff4 Mon Sep 17 00:00:00 2001 From: Mark Reyes Date: Thu, 7 Jul 2022 15:21:24 -0700 Subject: [PATCH 1/2] https://github.com/CDLUC3/mrt-doc/issues/1053 --- lib/mrt/ingest/component.rb | 26 +---- lib/mrt/ingest/iobject.rb | 31 ++--- lib/mrt/ingest/one_time_server.rb | 111 ------------------ mrt-ingest.gemspec | 2 +- spec/unit/mrt/ingest/component_spec.rb | 2 - spec/unit/mrt/ingest/iobject_spec.rb | 11 +- spec/unit/mrt/ingest/one_time_server_spec.rb | 113 ------------------- 7 files changed, 14 insertions(+), 282 deletions(-) delete mode 100644 lib/mrt/ingest/one_time_server.rb delete mode 100644 spec/unit/mrt/ingest/one_time_server_spec.rb diff --git a/lib/mrt/ingest/component.rb b/lib/mrt/ingest/component.rb index 95b241c..1828cf1 100644 --- a/lib/mrt/ingest/component.rb +++ b/lib/mrt/ingest/component.rb @@ -6,10 +6,9 @@ module Ingest # #File. class Component # :nodoc: - attr_reader :server, :uri + attr_reader :uri - def initialize(server, location, options) - @server = server + def initialize(location, options) @name = options[:name] @digest = options[:digest] @mime_type = options[:mime_type] @@ -18,25 +17,6 @@ def initialize(server, location, options) init_uri(location) end - class << self - def from_erc(server, erc) - return Component.new(server, erc, name: 'mrt-erc.txt') if erc.is_a?(URI) || erc.is_a?(File) - return from_hash(server, erc) if erc.is_a?(Hash) - - raise ArgumentError, 'Bad ERC supplied: must be a URI, File, or Hash' - end - - def from_hash(server, erc_h) - uri_str, path = server.add_file do |f| - f.write("erc:\n") - erc_h.each_pair { |k, v| f.write("#{k}: #{v}\n") } - end - - digest = Mrt::Ingest::MessageDigest::MD5.from_file(File.new(path)) - Component.new(server, URI.parse(uri_str), name: 'mrt-erc.txt', digest: digest) - end - end - def to_manifest_entry "#{@uri} | #{digest_type} | #{digest_value} | #{@size} | | #{@name} | #{@mime_type}\n" end @@ -64,7 +44,7 @@ def init_uri(location) def init_from_file(file) @name = File.basename(file.path) if @name.nil? - @uri = server.add_file(file)[0] + # @uri = server.add_file(file)[0] @digest = Mrt::Ingest::MessageDigest::MD5.from_file(file) if @digest.nil? @size = File.size(file.path) if @size.nil? end diff --git a/lib/mrt/ingest/iobject.rb b/lib/mrt/ingest/iobject.rb index b4ba9af..f773db8 100644 --- a/lib/mrt/ingest/iobject.rb +++ b/lib/mrt/ingest/iobject.rb @@ -13,17 +13,15 @@ module Ingest class IObject attr_accessor :primary_identifier, :local_identifier, :erc - attr_reader :server # Options can have the keys :primary_identifier, - # :local_identifier, :server, or :erc. :erc can be a #File, #Uri - # or a #Hash of metadata. :server is a #OneTimeServer. + # :local_identifier, or :erc. :erc can be a #File, #Uri + # or a #Hash of metadata. def initialize(options = {}) @primary_identifier = options[:primary_identifier] @local_identifier = options[:local_identifier] @erc = options[:erc] || {} @components = [] - @server = options[:server] || Mrt::Ingest::OneTimeServer.new end # Add a component to the object. where can be either a #URI or a @@ -32,33 +30,20 @@ def initialize(options = {}) # subclass of Mrt::Ingest::MessageDigest::Base. If where is a # #File, it will be hosted on an embedded web server. def add_component(where, options = {}) - @components.push(Component.new(@server, where, options)) + @components.push(Component.new(where, options)) end # Make a Mrt::Ingest::Request object for this mrt-object def mk_request(profile, user_agent) manifest_file = Tempfile.new('mrt-ingest') - erc_component = Component.from_erc(@server, @erc) - mk_manifest(manifest_file, erc_component) + mk_manifest(manifest_file) # reset to beginning manifest_file.open new_request(manifest_file, profile, user_agent) end - def start_server # :nodoc: - @server.start_server - end - - def join_server # :nodoc: - @server.join_server - end - - def stop_server # :nodoc: - @server.stop_server - end - # rubocop:disable Metrics/LineLength - def mk_manifest(manifest, erc_component) # :nodoc: + def mk_manifest(manifest) # :nodoc: manifest.write("#%checkm_0.7\n") manifest.write("#%profile http://uc3.cdlib.org/registry/ingest/manifest/mrt-ingest-manifest\n") manifest.write("#%prefix | mrt: | http://uc3.cdlib.org/ontology/mom#\n") @@ -67,7 +52,6 @@ def mk_manifest(manifest, erc_component) # :nodoc: @components.each do |c| manifest.write(c.to_manifest_entry) end - manifest.write(erc_component.to_manifest_entry) manifest.write("#%EOF\n") end # rubocop:enable Metrics/LineLength @@ -76,7 +60,6 @@ def mk_manifest(manifest, erc_component) # :nodoc: # submitter. def start_ingest(client, profile, submitter) request = mk_request(profile, submitter) - start_server @response = client.ingest(request) end @@ -84,7 +67,6 @@ def start_ingest(client, profile, submitter) def finish_ingest # XXX Right now we only join the hosting server; in the future # we will check the status via the ingest server. - join_server end private @@ -96,6 +78,9 @@ def new_request(manifest_file, profile, user_agent) type: 'object-manifest', submitter: user_agent, profile: profile, + title: @erc[:what], + creator: @erc[:who], + date: @erc[:when], local_identifier: @local_identifier, primary_identifier: @primary_identifier ) diff --git a/lib/mrt/ingest/one_time_server.rb b/lib/mrt/ingest/one_time_server.rb deleted file mode 100644 index 2bbfe59..0000000 --- a/lib/mrt/ingest/one_time_server.rb +++ /dev/null @@ -1,111 +0,0 @@ -# Author:: Erik Hetzner (mailto:erik.hetzner@ucop.edu) -# Copyright:: Copyright (c) 2011, Regents of the University of California - -require 'webrick' - -# An HTTP server that will serve each file ONCE before shutting down. -module Mrt - module Ingest - class OneTimeServer - - attr_reader :dir, :port - - # Find an open port, starting with start and adding one until we get - # an open port - def get_open_port(start = 8081) - try_port = start - loop do - begin - s = TCPServer.open(try_port) - s.close - return try_port - rescue Errno::EADDRINUSE - try_port += 1 - end - end - end - - def initialize - @dir = Dir.mktmpdir - @mutex = Mutex.new - @known_paths = {} - @requested = {} - @port = get_open_port - @file_callback = ->(req, _res) { @requested[req.path] ||= true } - @server = WEBrick::HTTPServer.new(Port: @port) - @server.mount('/', WEBrick::HTTPServlet::FileHandler, @dir, FileCallback: @file_callback) - end - - # Return true if each file has been served. - def finished? - Dir.entries(@dir).each do |entry| - next if %w[. ..].include?(entry) - return false if @requested["/#{entry}"].nil? - end - true - end - - def temppath - tmpfile = Tempfile.new('tmp', @dir) - tmppath = tmpfile.path - tmpfile.close! - @mutex.synchronize do - unless @known_paths.key?(tmppath) - # no collision - @known_paths[tmppath] = true - return tmppath - end - end - # need to retry, there was a collision - temppath - end - - # Add a file to this server. Returns the URL to use - # to fetch the file & the file path - def add_file(sourcefile = nil) - fullpath = temppath - path = File.basename(fullpath) - - if sourcefile - @server.mount("/#{path}", WEBrick::HTTPServlet::FileHandler, sourcefile.path, FileCallback: @file_callback) - else - File.open(fullpath, 'w+') { |f| yield f } - end - ["http://#{Socket.gethostname}:#{@port}/#{path}", fullpath] - end - - def start_server - if @thread.nil? - @thread = Thread.new do - @server.start - end - end - sleep(0.1) while @server.status != :Running - @thread - end - - # Stop server unconditionally. - def stop_server - @server.shutdown - @thread.join - end - - # Wait for server to finish serving all files. - def join_server - # ensure that each file is requested once before shutting down - sleep(1) until finished? - @server.shutdown - @thread.join - end - - # Run the server and wait until each file has been served once. - # Cleans up files before it returns. - def run - start_server - join_server - # FileUtils.rm_rf(@dir) - nil - end - end - end -end diff --git a/mrt-ingest.gemspec b/mrt-ingest.gemspec index ed0ea6f..83e2b51 100644 --- a/mrt-ingest.gemspec +++ b/mrt-ingest.gemspec @@ -3,7 +3,7 @@ $LOAD_PATH.push File.expand_path('lib', __dir__) Gem::Specification.new do |s| s.required_ruby_version = '>= 2.4.0' s.name = 'mrt-ingest' - s.version = '0.0.6' + s.version = '0.0.7' s.platform = Gem::Platform::RUBY s.authors = ['Mark Reyes', 'David Moles'] s.email = ['mark.reyes@ucop.edu', 'david.moles@ucop.edu'] diff --git a/spec/unit/mrt/ingest/component_spec.rb b/spec/unit/mrt/ingest/component_spec.rb index b140245..534e8f1 100644 --- a/spec/unit/mrt/ingest/component_spec.rb +++ b/spec/unit/mrt/ingest/component_spec.rb @@ -4,8 +4,6 @@ module Mrt::Ingest describe Component do describe :from_erc do it 'rejects string ERCs' do - server = instance_double(OneTimeServer) - expect { Component.from_erc(server, 'I am not an ERC') }.to raise_error(ArgumentError) end end end diff --git a/spec/unit/mrt/ingest/iobject_spec.rb b/spec/unit/mrt/ingest/iobject_spec.rb index fdd3831..a8c690a 100644 --- a/spec/unit/mrt/ingest/iobject_spec.rb +++ b/spec/unit/mrt/ingest/iobject_spec.rb @@ -81,11 +81,9 @@ def parse_erc_entry(erc_entry) def check_erc_content(iobject, asserted_erc) erc_entry = get_uri_for_name(iobject, 'mrt-erc.txt') expect(erc_entry).not_to be_nil - iobject.start_server begin expect(parse_erc_entry(erc_entry)).to eq(asserted_erc) ensure - iobject.stop_server end end @@ -137,11 +135,9 @@ def check_erc_content(iobject, asserted_erc) it 'should serve a valid mrt-erc.txt entry' do expect(@erc_entry).not_to be_nil - @iobject.start_server begin open(@erc_entry.values[0]).read.lines.to_a ensure - @iobject.stop_server end end @@ -186,11 +182,9 @@ def check_erc_content(iobject, asserted_erc) manifest = parse_object_manifest(iobject) expect(manifest).not_to(be_nil) expect(uri_entry).not_to be_nil - iobject.start_server begin expect(open(uri_entry.values[0]).read).to eq(FILE_CONTENT) ensure - iobject.stop_server end end end @@ -221,9 +215,8 @@ def check_erc_content(iobject, asserted_erc) @iobject.start_ingest(@client, 'example_profile', 'Atom processor/Example collection') # TODO: just mock the server - server = @iobject.server - files = Dir.entries(server.dir).reject { |e| %w[. ..].include?(e) } - urls = files.map { |f| "http://#{Socket.gethostname}:#{server.port}/#{f}" } + # files = Dir.entries(server.dir).reject { |e| %w[. ..].include?(e) } + # urls = files.map { |f| "http://#{Socket.gethostname}:#{server.port}/#{f}" } client_process_id = fork do begin diff --git a/spec/unit/mrt/ingest/one_time_server_spec.rb b/spec/unit/mrt/ingest/one_time_server_spec.rb deleted file mode 100644 index 1b37c4b..0000000 --- a/spec/unit/mrt/ingest/one_time_server_spec.rb +++ /dev/null @@ -1,113 +0,0 @@ -require 'spec_helper' -require 'English' - -module Mrt::Ingest - describe OneTimeServer do - attr_reader :server - - before(:each) do - @server = OneTimeServer.new - server.start_server - end - - after(:each) do - server.stop_server - end - - describe :finished? do - it 'returns true when all files have been served, false otherwise' do - urls = (0..3).map do |i| - url_str, = server.add_file { |f| f.puts("I am file #{i}") } - url_str - end - - urls.each do |url| - expect(server.finished?).to be_falsey - Net::HTTP.get(URI.parse(url)) - end - - expect(server.finished?).to be_truthy - end - end - - describe :temppath do - it 'avoids collisions' do - tmpfiles = [] - allow(Tempfile).to receive(:new).and_wrap_original do |m, *args| - tmpfile = m.call(*args) - if tmpfiles.empty? - known_paths = server.instance_variable_get(:@known_paths) - known_paths[tmpfile.path] = true - end - tmpfiles << tmpfile.path - tmpfile - end - - temppath = server.temppath - expect(tmpfiles.size).to eq(2) - expect(temppath).to eq(tmpfiles[1]) - end - end - - describe :join_server do - it 'blocks till all files have been served' do - urls = (0..3).map do |i| - url_str, = server.add_file { |f| f.puts("I am file #{i}") } - url_str - end - - joining_thread = Thread.new { server.join_server } - expect(joining_thread.status).not_to be_falsey - - client_process_id = fork do - begin - urls.each do |url| - resp = Net::HTTP.get_response(URI.parse(url)) - status = resp.code.to_i - exit(status) if status != 200 - end - rescue StandardError => e - warn(e) - exit(1) - end - end - Process.wait(client_process_id) - expect($CHILD_STATUS.exitstatus).to eq(0) # just to be sure - - Timeout.timeout(5) { joining_thread.join } - expect(joining_thread.status).to eq(false) - end - end - - describe :run do - it 'starts, serves, and stops' do - server2 = OneTimeServer.new - urls = (0..3).map do |i| - url_str, = server2.add_file { |f| f.puts("I am file #{i}") } - url_str - end - - running_thread = Thread.new { server2.run } - expect(running_thread.status).not_to be_falsey - - client_process_id = fork do - begin - urls.each do |url| - resp = Net::HTTP.get_response(URI.parse(url)) - status = resp.code.to_i - exit(status) if status != 200 - end - rescue StandardError => e - warn(e) - exit(1) - end - end - Process.wait(client_process_id) - expect($CHILD_STATUS.exitstatus).to eq(0) # just to be sure - - Timeout.timeout(5) { running_thread.join } - expect(running_thread.status).to eq(false) - end - end - end -end From 063dc8bf539babf9709bbadaf2445b528273677f Mon Sep 17 00:00:00 2001 From: Mark Reyes Date: Tue, 12 Jul 2022 15:35:02 -0700 Subject: [PATCH 2/2] Fix DC metadata dereference --- lib/mrt/ingest/iobject.rb | 8 ++++---- mrt-ingest.gemspec | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/mrt/ingest/iobject.rb b/lib/mrt/ingest/iobject.rb index f773db8..634e583 100644 --- a/lib/mrt/ingest/iobject.rb +++ b/lib/mrt/ingest/iobject.rb @@ -12,7 +12,7 @@ module Ingest # An object prepared for ingest into Merritt. class IObject - attr_accessor :primary_identifier, :local_identifier, :erc + attr_accessor :primary_identifier, :local_identifier, :erc, :what, :who, :when # Options can have the keys :primary_identifier, # :local_identifier, or :erc. :erc can be a #File, #Uri @@ -78,9 +78,9 @@ def new_request(manifest_file, profile, user_agent) type: 'object-manifest', submitter: user_agent, profile: profile, - title: @erc[:what], - creator: @erc[:who], - date: @erc[:when], + title: @erc['what'], + creator: @erc['who'], + date: @erc['when'], local_identifier: @local_identifier, primary_identifier: @primary_identifier ) diff --git a/mrt-ingest.gemspec b/mrt-ingest.gemspec index 83e2b51..e1d5cb5 100644 --- a/mrt-ingest.gemspec +++ b/mrt-ingest.gemspec @@ -3,7 +3,7 @@ $LOAD_PATH.push File.expand_path('lib', __dir__) Gem::Specification.new do |s| s.required_ruby_version = '>= 2.4.0' s.name = 'mrt-ingest' - s.version = '0.0.7' + s.version = '0.0.10' s.platform = Gem::Platform::RUBY s.authors = ['Mark Reyes', 'David Moles'] s.email = ['mark.reyes@ucop.edu', 'david.moles@ucop.edu'] @@ -12,10 +12,11 @@ Gem::Specification.new do |s| s.description = 'A client for the Merritt ingest system. More details available from https://github.com/CDLUC3/mrt-doc/wiki' s.license = 'BSD-3-Clause' - s.add_dependency 'json', '~> 2.0' - s.add_dependency 'rest-client', '~> 2.0' + # s.add_dependency 'json', '~> 2.1' + s.add_dependency 'rest-client', '~> 2.1' - s.add_development_dependency 'bundler', '>= 2.2.10' + + # s.add_development_dependency 'bundler', '>= 2.2.10' s.add_development_dependency 'checkm', '0.0.6' s.add_development_dependency 'mocha', '~> 1.7' s.add_development_dependency 'rake', '~> 12.0'