-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_terms_in_notes.rb
105 lines (88 loc) · 2.85 KB
/
find_terms_in_notes.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
require 'bundler/setup'
Bundler.require
require './as_config_production.rb'
require 'csv'
require 'active_support/inflector'
# read in search terms to pass
@search_terms = IO.readlines("search_terms.txt").each { |line| line.gsub!( "\n", '' ) }
# initialize connection
@a = ArchivesSpaceApiUtility::ArchivesSpaceSession.new
# set page size for paginated search requests
@page_size = 10
# define CSV headers - we'll use this 2 different ways
@csv_headers = ['term', 'uri', 'title', 'notes']
# Use this method to perform the API request to search the notes field on a record for whatever term
def get_terms(page=1, term)
# set API endpoint
path = '/search'
# build query parameters
params = {}
params[:page] = page
params[:page_size] = @page_size
params[:q] = "notes:(\"#{term}\" OR \"#{term.pluralize}\")"
# execute request
response = @a.get(path, params)
# convert JSON to Hash
JSON.parse(response.body)
end
# process raw values from API response (a hash) to array of values for a row in the CSV
# creates resolvable links to individual records on archivesspace
# removes line breaks from strings and replaces them with ' | '
def values_to_row(hash,term)
row = @csv_headers.map do |h|
if h == 'term'
term
elsif hash[h]
if hash[h].is_a? String
hash[h].strip.gsub(/[\n\r]+/, ' | ')
else
hash[h]
end
end
end
row
end
def get_rows(term)
rows = []
# do an initial query to get the total results, and use that to determine number of pages
data = get_terms(term)
count = data['total_hits']
pages = (count.to_f / @page_size).ceil
# set page to begin iterating through all pages of results
page = 1
puts "Fetching #{count} records for '#{term}'..."
# do a request for each page and put each record in @rows
while (page <= pages) do
data = get_terms(page, term)
results = data['results']
results.each do |r|
# This is the easy version that doesn't replace line breaks
# rows << @csv_headers.map { |h| r[h] }
# This is the better version
rows << values_to_row(r, term)
end
page += 1
print '.' # to provide a little visual feedback while the script runs
end
puts # just to add a line break to the terminal output
rows
end
def generate_csv(rows)
# create a new directory for CSV files if it doesn't exist
csv_dir = './csv'
Dir.mkdir(csv_dir) if !Dir.exist?(csv_dir)
# set file name for CSV to be generated
csv_filepath = "#{csv_dir}/remediation_#{Date.today.to_s}.csv"
# use CSV.open to create a writable .csv file and write to it
CSV.open(csv_filepath, 'w', headers: @csv_headers, write_headers: true, force_quotes: true) do |csv|
rows.each do |row|
csv << row
end
end
end
@rows = []
#put it all together! iterate through each term in the text file
@search_terms.each do |term|
@rows += get_rows(term)
end
generate_csv(@rows)