MITLibraries · JPrevost · Dec 5, 2024 · Nov 20, 2024 · Nov 21, 2024 · Nov 22, 2024
diff --git a/app/models/preprocessor_primo.rb b/app/models/preprocessor_primo.rb
@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+
+# PreprocessorPrimo handles manipulating incoming data from the Primo UI into a structure that TACOS can work with
+class PreprocessorPrimo
+  # to_tacos processes the raw incoming query from Primo and reduces it to a phrase that TACOS can log.
+  # Only a single keyword-anywhere search is extracted. Anything else is dropped from TACOS and logged
+  # under the shared Term `unhandled complex primo query`, which lets us track how often that happens
+  # so we can come back later to build out more complex handling if it is common enough to warrant it.
+  # @param query [String] example `any,contains,this is a keyword search`
+  # @return [String] the keyword phrase, `unhandled complex primo query`, or `invalid primo query`
+  def self.to_tacos(query)
+    # Primo and TACOS agreed upon joiner is `;;;`
+    split_query = query.split(';;;')
+
+    if split_query.size > 1
+      Rails.logger.debug('Multipart primo query detected')
+
+      # As we are not currently handling complex queries, always set the value to something we can track frequency of
+      'unhandled complex primo query'
+    else
+      Rails.logger.debug('Simple primo query detected')
+
+      extract_keyword(query)
+    end
+  end
+
+  # keyword? confirms whether a portion of a primo query is a keyword search
+  # Note: we expect exactly 3 elements in this array for simple keyword searches, so any commas in the original
+  # search must already have been collapsed back into one element via join_keyword_and_drop_extra_parts
+  # @param query_part_array [Array] example ['any', 'contains', 'this is a keyword search']
+  # @return [Boolean]
+  def self.keyword?(query_part_array)
+    return false unless query_part_array.size == 3
+    return false unless query_part_array.first == 'any'
+
+    # For now, we are allowing all variants of the second portion of the primo query input
+    # The expected values are: contains, exact, begins_with, equals
+    # Uncommenting the following statement would allow us to restrict to just the default 'contains' if desirable
+    #
+    # return false unless query_part_array[1] == 'contains'
+
+    true
+  end
+
+  # extract_keyword works at the level of a single keyword query input coming from primo and
+  # returns a string with just that keyword with the operators removed
+  # @param query_part [String] example `any,contains,this is a keyword search`
+  # @return [String] the extracted keyword phrase, or one of the tracking Terms when it cannot be extracted
+  def self.extract_keyword(query_part)
+    # A negative limit keeps trailing empty fields; a plain split(',') silently drops a search's trailing commas
+    query_part_array = query_part.split(',', -1)
+
+    # Fewer than 3 parts, or a keyword that is empty once commas are ignored, is not a state we anticipate, so we
+    # track it under the Term `invalid primo query` and notify Sentry to understand the context if it does happen
+    if query_part_array.size < 3 || query_part_array.drop(2).all?(&:empty?)
+      Sentry.capture_message('PreprocessorPrimo: Invalid Primo query during keyword extraction')
+      return 'invalid primo query'
+    end
+
+    the_keywords = join_keyword_and_drop_extra_parts(query_part_array)
+    return 'unhandled complex primo query' unless keyword?([query_part_array[0], query_part_array[1], the_keywords])
+
+    the_keywords
+  end
+
+  # join_keyword_and_drop_extra_parts rebuilds a search that contained commas into a single ruby string
+  # after the incoming string has been separated into an array based on commas
+  # @param query_part_array [Array] example `['any', 'contains', 'this', 'is', 'a', 'keyword', 'search']`
+  # @return [String] example 'this,is,a,keyword,search' (empty string when there are fewer than 3 elements)
+  def self.join_keyword_and_drop_extra_parts(query_part_array)
+    # For complex queries, which we are not handling yet, we'll need to determine how TACOS should handle the final
+    # element of the input which will be a boolean operator. For now, we will have stopped processing those by this
+    # point during the initial logic in `to_tacos` that splits on `;;;` and returns if the result is more than one query
+    query_part_array.drop(2).join(',')
+  end
+end
diff --git a/app/models/search_logger.rb b/app/models/search_logger.rb
@@ -6,8 +6,24 @@ class SearchLogger
   # Receives a phrase and source and creates a search event. Will find or create a term as needed.
   # @return [SearchEvent] the newly created SearchEvent
   def self.logevent(phrase, source)
-    term = Term.create_or_find_by!(phrase:)
+    term = Term.create_or_find_by!(phrase: extract_phrase(phrase, source))
     term.calculate_categorizations
     term.search_events.create!(source:)
   end
+
+  # Coordinates `phrase` extraction from incoming data from each `source`. If no `source` is matched,
+  # passes through incoming `phrase`.
+  # Note: as it may become useful to test in a production environment, we match on patterns of sources
+  # rather than exact string matches. Example: `primo`, `primo-testing`, `primo-playground` are all handled
+  # with the same case.
+  def self.extract_phrase(phrase, source)
+    # `case` compares with Regexp#===, so any source containing `primo` (e.g. `primo-testing`) matches
+    from_primo = case source
+                 when /primo/ then true
+                 else false
+                 end
+    Rails.logger.debug(from_primo ? 'Primo case detected' : 'default case detected')
+    # Sources that are not Primo need no preprocessing, so their phrase is passed through untouched
+    from_primo ? PreprocessorPrimo.to_tacos(phrase) : phrase
+  end
 end
diff --git a/test/controllers/graphql_controller_test.rb b/test/controllers/graphql_controller_test.rb
@@ -214,4 +214,30 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest
     assert_equal 'Transactional', json['data']['lookupTerm']['categories'].first['name']
     assert_in_delta 0.95, json['data']['lookupTerm']['categories'].first['confidence']
   end
+
+  test 'primo searches use the preprocessor to extract actual keywords' do
+    # The raw Primo-formatted term goes in; the phrase reported back should be only the keywords
+    post '/graphql', params: { query: '{
+                                 logSearchEvent(sourceSystem: "primo-test",
+                                                searchTerm: "any,contains,Super cool search") {
+                                   phrase
+                                 }
+                               }' }
+    logged_phrase = response.parsed_body['data']['logSearchEvent']['phrase']
+
+    assert_equal 'Super cool search', logged_phrase
+  end
+
+  test 'primo searches use the preprocessor and logs complex queries to a specific term' do
+    # Two query parts joined by `;;;` should be logged under the shared tracking phrase instead
+    post '/graphql', params: { query: '{
+                                 logSearchEvent(sourceSystem: "primo-test",
+                                                searchTerm: "any,contains,Super cool search;;;any,contains,uh oh this is getting complicated") {
+                                   phrase
+                                 }
+                               }' }
+    logged_phrase = response.parsed_body['data']['logSearchEvent']['phrase']
+
+    assert_equal 'unhandled complex primo query', logged_phrase
+  end
 end
diff --git a/test/models/preprocessor_primo_test.rb b/test/models/preprocessor_primo_test.rb
@@ -0,0 +1,102 @@
+# frozen_string_literal: true
+
+#
+require 'test_helper'
+
+class PreprocessorPrimoTest < ActiveSupport::TestCase
+  test 'to_tacos returns unhandled for complex queries' do
+    phrase = PreprocessorPrimo.to_tacos('any,contains,space;;;any,contains,madness')
+
+    assert_equal 'unhandled complex primo query', phrase
+  end
+
+  test 'to_tacos returns unhandled for targeted field queries' do
+    phrase = PreprocessorPrimo.to_tacos('title,contains,space')
+
+    assert_equal 'unhandled complex primo query', phrase
+  end
+
+  test 'to_tacos returns phrase ready for tacos for simple keyword input' do
+    phrase = PreprocessorPrimo.to_tacos('any,contains,space')
+
+    assert_equal 'space', phrase
+  end
+
+  test 'to_tacos returns phrase ready for complex keyword input' do
+    citation = 'Yan, F., Krantz, P., Sung, Y., Kjaergaard, M., Campbell, D.L., Orlando, T.P., Gustavsson, S. and Oliver, W.D., 2018. Tunable coupling scheme for implementing high-fidelity two-qubit gates. Physical Review Applied, 10(5), p.054062.'
+    phrase = PreprocessorPrimo.to_tacos("any,contains,#{citation}")
+
+    assert_equal citation, phrase
+  end
+
+  test 'keyword? returns true for any contains phrase pattern' do
+    query_parts = 'any,contains,popcorn anomoly'.split(',')
+
+    assert PreprocessorPrimo.keyword?(query_parts)
+  end
+
+  test 'keyword? returns false for input with more than 3 array elements' do
+    # NOTE: TACOS could handle this query, but in the normal application flow the comma-separated pieces
+    # are joined back together before keyword? runs, so keyword? never sees more than 3 elements
+    query_parts = 'any,contains,popcorn anomoly: why life on the moon is complex, and other cat facts'.split(',')
+
+    assert_not PreprocessorPrimo.keyword?(query_parts)
+  end
+
+  test 'keyword? returns false for input with less than 3 array elements' do
+    query_parts = 'any,contains'.split(',')
+
+    assert_not PreprocessorPrimo.keyword?(query_parts)
+  end
+
+  test 'keyword? returns false for non-any input' do
+    query_parts = 'title,contains,popcorn anomoly'.split(',')
+
+    assert_not PreprocessorPrimo.keyword?(query_parts)
+  end
+
+  test 'keyword? returns true for non-contains inputs' do
+    # NOTE: this portion of the primo query focuses on how to handle the phrase. All the words, any of the words,
+    # the exact phrase, begins_with. For now we treat them all the same as standard keyword queries.
+    query_parts = 'any,exact,popcorn anomoly'.split(',')
+
+    assert PreprocessorPrimo.keyword?(query_parts)
+  end
+
+  test 'extract keyword returns keyword for simple keywords' do
+    keyword = PreprocessorPrimo.extract_keyword('any,contains,popcorn anomoly')
+
+    assert_equal 'popcorn anomoly', keyword
+  end
+
+  test 'extract keyword returns keyword for simple non-contains keywords' do
+    keyword = PreprocessorPrimo.extract_keyword('any,exact,popcorn anomoly')
+
+    assert_equal 'popcorn anomoly', keyword
+  end
+
+  test 'extract keyword returns unhandled complex primo query for non-any searches' do
+    keyword = PreprocessorPrimo.extract_keyword('title,contains,popcorn anomoly')
+
+    assert_equal 'unhandled complex primo query', keyword
+  end
+
+  test 'extract keyword returns keyword for keywords with punctuation' do
+    keyword = PreprocessorPrimo.extract_keyword('any,contains,popcorn anomoly: a cats! life. on & mars!')
+
+    assert_equal 'popcorn anomoly: a cats! life. on & mars!', keyword
+  end
+
+  test 'extract keyword returns keyword for keywords with commas' do
+    keyword = PreprocessorPrimo.extract_keyword('any,contains,popcorn anomoly, and so can you')
+
+    assert_equal 'popcorn anomoly, and so can you', keyword
+  end
+
+  test 'extract keyword returns keyword for keywords with multiple commas and other punctuation' do
+    expected = 'popcorn anomoly: a cats! life. on & mars!, words, of {truth} (and, also not,)'
+    keyword = PreprocessorPrimo.extract_keyword("any,contains,#{expected}")
+
+    assert_equal expected, keyword
+  end
+end