diff --git a/app/models/detector/suggested_resource.rb b/app/models/detector/suggested_resource.rb
index 526074e..b927fe2 100644
--- a/app/models/detector/suggested_resource.rb
+++ b/app/models/detector/suggested_resource.rb
@@ -53,5 +53,33 @@ def calculate_fingerprint(old_phrase)
       # Rejoin tokens
       tokens.join(' ')
     end
+
+    # Replaces every stored suggested resource with the rows of a parsed CSV.
+    # It is called by the suggested_resources:reload rake task.
+    #
+    # @param input [CSV::Table] parsed CSV whose headers include the symbols
+    #   :title, :url and :phrase; any other columns are ignored.
+    # @raise [ArgumentError] if input is not a CSV::Table or a required column
+    #   is missing.
+    # @raise [ActiveRecord::RecordInvalid] if a row fails validation; the
+    #   transaction is rolled back, so the previous records are kept.
+    def self.bulk_replace(input)
+      raise ArgumentError, 'Tabular CSV is required' unless input.instance_of?(CSV::Table)
+
+      # Need to check what columns exist in input
+      required_headers = %i[title url phrase]
+      missing_headers = required_headers - input.headers
+      raise ArgumentError, "Some CSV columns missing: #{missing_headers}" unless missing_headers.empty?
+
+      # Delete and reload inside a single transaction so that a failing row
+      # cannot leave the table empty or only partly loaded.
+      transaction do
+        delete_all
+
+        input.each do |line|
+          create!(title: line[:title], url: line[:url], phrase: line[:phrase])
+        end
+      end
+    end
   end
 end
diff --git a/lib/tasks/suggested_resources.rake b/lib/tasks/suggested_resources.rake
index fbab52a..2fe87da 100644
--- a/lib/tasks/suggested_resources.rake
+++ b/lib/tasks/suggested_resources.rake
@@ -12,26 +12,39 @@ namespace :suggested_resources do
     Rails.logger.info("Record count before we reload: #{Detector::SuggestedResource.count}")
 
     if URI(args.addr).scheme
-      Rails.logger.info("Loading from remote address: #{args.addr}")
       url = URI.parse(args.addr)
       raise ArgumentError.new, 'HTTP/HTTPS scheme is required' unless url.scheme.in?(%w[http https])
 
-      Rails.logger.info(url)
-      file = url.read
-      Rails.logger.info(file)
-      # Need to connect to a CSV content type
-      # Invalid parsing should... do something?
+      data = csv_table_from_url_direct(url)
     else
-      Rails.logger.info("Loading from local file: #{args.addr}")
-      file = File.read(args.addr)
-      Rails.logger.info(file)
       # Invalid / not found file should ... do something?
+      # Pass the path itself: CSV.table opens and closes the file for us.
+      data = CSV.table(args.addr)
     end
 
-    Rails.logger.info('Now ready to parse a CSV')
-    data = CSV.parse(file)
-    Rails.logger.info(data)
+    Detector::SuggestedResource.bulk_replace(data)
 
-    # Rails.logger.info("Record count after we reload: #{Detector::SuggestedResource.count}")
+    Rails.logger.info("Record count after we reload: #{Detector::SuggestedResource.count}")
+  end
+
+  # Downloads the CSV at url and parses it into a CSV::Table whose headers are
+  # symbols, which is what Detector::SuggestedResource.bulk_replace checks for.
+  def csv_table_from_url_direct(url)
+    # Treat the bytes as UTF-8 and drop a leading byte order mark so that it
+    # does not end up inside the first header.
+    body = url.read.force_encoding('UTF-8').delete_prefix("\uFEFF")
+    CSV.parse(body, headers: true, header_converters: :symbol)
+  end
+
+  # Alternative to csv_table_from_url_direct that builds the CSV::Table by hand
+  # from the raw rows, normalizing each header to a lowercase symbol.
+  def csv_table_from_url_rebuild(url)
+    raw = url.read.force_encoding('UTF-8')
+    header_row, *value_rows = CSV.parse(raw)
+    raise ArgumentError, 'CSV has no header row' if header_row.nil?
+
+    # Drop any byte order mark, then trim and downcase each header.
+    header = header_row.map { |field| field.to_s.delete("\uFEFF").strip.downcase.to_sym }
+    CSV::Table.new(value_rows.map { |row| CSV::Row.new(header, row) })
   end
 end
diff --git a/test/fixtures/files/suggested_resources.csv b/test/fixtures/files/suggested_resources.csv
new file mode 100644
index 0000000..f0bec40
--- /dev/null
+++ b/test/fixtures/files/suggested_resources.csv
@@ -0,0 +1,3 @@
+Title,URL,Phrase
+New Example,https://example.org,new example search
+Web of Science,https://libraries.mit.edu/webofsci,web of Science
\ No newline at end of file
diff --git a/test/fixtures/files/suggested_resources.xlsx b/test/fixtures/files/suggested_resources.xlsx
new file mode 100644
index 0000000..819974a
Binary files /dev/null and b/test/fixtures/files/suggested_resources.xlsx differ
diff --git a/test/fixtures/files/suggested_resources_extra.csv b/test/fixtures/files/suggested_resources_extra.csv
new file mode 100644
index 0000000..adeb0c3
--- /dev/null
+++ b/test/fixtures/files/suggested_resources_extra.csv
@@ -0,0 +1,3 @@
+Title,URL,Phrase,Extra
+Example,https://example.org,example search,extra 1
+Web of Science,https://libraries.mit.edu/webofsci,web of Science,extra 2
\ No newline at end of file
diff --git a/test/fixtures/files/suggested_resources_missing_field.csv b/test/fixtures/files/suggested_resources_missing_field.csv
new file mode 100644
index 0000000..9c478b3
--- /dev/null
+++ b/test/fixtures/files/suggested_resources_missing_field.csv
@@ -0,0 +1,3 @@
+Title,URL
+Example,https://example.org
+Web of Science,https://libraries.mit.edu/webofsci
\ No newline at end of file
diff --git a/test/fixtures/files/suggested_resources_wrong_columns.csv b/test/fixtures/files/suggested_resources_wrong_columns.csv
new file mode 100644
index 0000000..1ddcdfa
--- /dev/null
+++ b/test/fixtures/files/suggested_resources_wrong_columns.csv
@@ -0,0 +1,2 @@
+Title,URL
+Example,https://example.org
\ No newline at end of file
diff --git a/test/tasks/suggested_resource_rake_test.rb b/test/tasks/suggested_resource_rake_test.rb
new file mode 100644
index 0000000..8a0f034
--- /dev/null
+++ b/test/tasks/suggested_resource_rake_test.rb
@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+
+require 'test_helper'
+require 'rake'
+
+class SuggestedResourceRakeTest < ActiveSupport::TestCase
+  def setup
+    Tacos::Application.load_tasks if Rake::Task.tasks.empty?
+    Rake::Task['suggested_resources:reload'].reenable
+  end
+
+  test 'invoked reload can accept a local file' do
+    records_before = Detector::SuggestedResource.count # We have three fixtures at the moment
+    first_record_before = Detector::SuggestedResource.first
+    local_file = Rails.root.join('test','fixtures','files','suggested_resources.csv').to_s
+    Rake::Task["suggested_resources:reload"].invoke(local_file)
+    refute_equal records_before, Detector::SuggestedResource.count
+    refute_equal first_record_before, Detector::SuggestedResource.first
+  end
+
+  test 'reload task errors without a file argument' do
+    assert_raises(ArgumentError) {
+      Rake::Task['suggested_resources:reload'].invoke
+    }
+  end
+
+  test 'reload can accept a url' do
+    VCR.use_cassette('remote csv') do
+      remote_file = 'http://static.lndo.site/suggested_resources.csv'
+      Rake::Task["suggested_resources:reload"].invoke(remote_file)
+    end
+  end
+
+  test 'reload fails with a non-CSV file' do
+    local_file = Rails.root.join('test','fixtures','files','suggested_resources.xlsx').to_s
+    assert_raises(CSV::MalformedCSVError) {
+      Rake::Task['suggested_resources:reload'].invoke(local_file)
+    }
+  end
+
+  test 'reload fails unless all three columns are present: title, url, phrase' do
+    local_file = Rails.root.join('test','fixtures','files','suggested_resources_missing_field.csv').to_s
+    error = assert_raises(ArgumentError) {
+      Rake::Task['suggested_resources:reload'].invoke(local_file)
+    }
+    assert_equal 'Some CSV columns missing: [:phrase]', error.message
+  end
+
+  test 'reload succeeds if extra columns are present' do
+    local_file = Rails.root.join('test','fixtures','files','suggested_resources_extra.csv').to_s
+    Rake::Task['suggested_resources:reload'].invoke(local_file)
+  end
+end
diff --git a/test/vcr_cassettes/remote_csv.yml b/test/vcr_cassettes/remote_csv.yml
new file mode 100644
index 0000000..9696462
--- /dev/null
+++ b/test/vcr_cassettes/remote_csv.yml
@@ -0,0 +1,40 @@
+---
+http_interactions:
+- request:
+    method: get
+    uri: http://static.lndo.site/suggested_resources.csv
+    body:
+      encoding: US-ASCII
+      string: ''
+    headers:
+      Accept-Encoding:
+      - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
+      Accept:
+      - "*/*"
+      User-Agent:
+      - Ruby
+  response:
+    status:
+      code: 200
+      message: OK
+    headers:
+      Accept-Ranges:
+      - bytes
+      Content-Length:
+      - '129'
+      Content-Type:
+      - text/csv
+      Date:
+      - Mon, 05 Aug 2024 19:32:16 GMT
+      Etag:
+      - '"81-61ef4b40715fb"'
+      Last-Modified:
+      - Mon, 05 Aug 2024 19:30:01 GMT
+      Server:
+      - Apache/2.4.54 (Debian)
+    body:
+      encoding: ASCII-8BIT
+      string: !binary |-
+        77u/VGl0bGUsVVJMLFBocmFzZQ0KRXhhbXBsZSxodHRwczovL2V4YW1wbGUub3JnLGV4YW1wbGUgc2VhcmNoDQpXZWIgb2YgU2NpZW5jZSxodHRwczovL2xpYnJhcmllcy5taXQuZWR1L3dlYm9mc2NpLHdlYiBvZiBTY2llbmNl
+  recorded_at: Mon, 05 Aug 2024 19:32:16 GMT
+recorded_with: VCR 6.2.0