From 59e8f7a707fc7dfaf06abf46cfc21bb74d4140ec Mon Sep 17 00:00:00 2001
From: Jeremy Prevost
Date: Fri, 12 Jul 2024 12:43:39 -0400
Subject: [PATCH] Adds Detector::Journal class

Why are these changes being introduced:

* Storing journal information locally will be used for detecting journal
  names in incoming search terms

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TCO-38

How does this address that need:

* Creates ActiveRecord model Journal inside a new Detector namespace
* It is likely we will move StandardIdentifiers into the Detector
  namespace along with any new Detectors (Hints, etc)

Document any side effects to this change:

* The `partial_term_match` algorithm may be unwieldy once a large number
  of journals are loaded. We'll need to assess once we do that whether
  this can be wired in as part of live matching or just as an
  interesting data point to help us understand if this would be useful
  for live detections
* The GraphQL endpoints will need a refactor to consider how to model
  the `Detections` rather than just the `StandardIdentifiers`. That is
  separately ticketed.
* Storing everything that isn't a journal name as json is new for us.
  This is partially done to allow us to not decide on the full scope of
  what to store before we understand what we need, and partially to
  allow us to explore what json tables may allow for us and to learn
  if/how indexing strategies need to change if we use json tables.
* We have not decided on where to source our journal data from yet. We
  may find once we move forward with that some changes will be needed
  here, but since this is a very simple model it felt safe enough to
  move forward at this time.
---
 app/models/detector.rb                        |  9 ++++
 app/models/detector/journal.rb                | 53 +++++++++++++++++++
 ...20240701205444_create_detector_journals.rb | 11 ++++
 db/schema.rb                                  | 10 +++-
 test/fixtures/detector/journals.yml           | 29 ++++++++++
 test/models/detector/journal_test.rb          | 48 +++++++++++++++++
 6 files changed, 159 insertions(+), 1 deletion(-)
 create mode 100644 app/models/detector.rb
 create mode 100644 app/models/detector/journal.rb
 create mode 100644 db/migrate/20240701205444_create_detector_journals.rb
 create mode 100644 test/fixtures/detector/journals.yml
 create mode 100644 test/models/detector/journal_test.rb

diff --git a/app/models/detector.rb b/app/models/detector.rb
new file mode 100644
index 0000000..5694035
--- /dev/null
+++ b/app/models/detector.rb
@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+
+# Detectors are classes that implement various algorithms that allow us to identify patterns
+# within search terms.
+module Detector
+  def self.table_name_prefix
+    'detector_'
+  end
+end
diff --git a/app/models/detector/journal.rb b/app/models/detector/journal.rb
new file mode 100644
index 0000000..61962ae
--- /dev/null
+++ b/app/models/detector/journal.rb
@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: detector_journals
+#
+#  id              :integer          not null, primary key
+#  name            :string
+#  additional_info :json
+#  created_at      :datetime         not null
+#  updated_at      :datetime         not null
+#
+module Detector
+  # Detector::Journal stores information about academic journals loaded from external sources to allow us to check our
+  # incoming Terms against this information
+  class Journal < ApplicationRecord
+    before_save :downcase_fields!
+
+    # Identify journals in which the incoming phrase matches a Journal.name exactly
+    #
+    # @note We always store the Journal.name downcased, so we should also always downcase the phrase when matching
+    #
+    # @note In reality, multiple Journals can exist with the same name. Therefore, we don't enforce
+    #   unique names and don't expect a single Journal to be returned.
+    #
+    # @param phrase [String] A string representation of a search term (not an actual Term object!)
+    #
+    # @return [ActiveRecord::Relation<Detector::Journal>] Journals whose name equals the downcased phrase.
+    def self.full_term_match(phrase)
+      Journal.where(name: phrase.downcase)
+    end
+
+    # Identify journals in which the incoming phrase contains one or more Journal names
+    #
+    # @note This likely won't scale well and may not be suitable for live detection as it loads all Journal records.
+    #
+    # @param phrase [String] A string representation of a search term (not an actual Term object!)
+    #
+    # @return [Array<Detector::Journal>] Journals whose non-blank name appears within the downcased phrase.
+    def self.partial_term_match(phrase)
+      haystack = phrase.downcase # downcase once, not once per Journal; blank names would raise or match everything
+      Journal.all.select { |journal| journal.name.present? && haystack.include?(journal.name) }
+    end
+
+    private
+
+    # Downcasing all names before saving allows for more efficient matching by ensuring our index is lowercase.
+    # If we find we need the non-lowercase Journal name in the future, we could store that as `additional_info` json
+    def downcase_fields!
+      self.name = name&.downcase # `name` is nullable in the schema, so do not raise on a nameless Journal
+    end
+  end
+end
diff --git a/db/migrate/20240701205444_create_detector_journals.rb b/db/migrate/20240701205444_create_detector_journals.rb
new file mode 100644
index 0000000..607d4d1
--- /dev/null
+++ b/db/migrate/20240701205444_create_detector_journals.rb
@@ -0,0 +1,11 @@
+class CreateDetectorJournals < ActiveRecord::Migration[7.1]
+  def change
+    create_table :detector_journals do |t|
+      t.string :name
+      t.json :additional_info
+
+      t.timestamps
+    end
+    add_index :detector_journals, :name
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 384ff92..3acaf32 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,7 +10,15 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema[7.1].define(version: 2024_06_21_132136) do
+ActiveRecord::Schema[7.1].define(version: 2024_07_01_205444) do
+  create_table "detector_journals", force: :cascade do |t|
+    t.string "name"
+    t.json "additional_info"
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+    t.index ["name"], name: "index_detector_journals_on_name"
+  end
+
   create_table "metrics_algorithms", force: :cascade do |t|
     t.date "month"
     t.integer "doi"
diff --git a/test/fixtures/detector/journals.yml b/test/fixtures/detector/journals.yml
new file mode 100644
index 0000000..edec9e2
--- /dev/null
+++ b/test/fixtures/detector/journals.yml
@@ -0,0 +1,29 @@
+# == Schema Information
+#
+# Table name: detector_journals
+#
+#  id              :integer          not null, primary key
+#  name            :string
+#  additional_info :json
+#  created_at      :datetime         not null
+#  updated_at      :datetime         not null
+#
+
+# Note: fixtures bypass ActiveRecord callbacks so while our model automatically downcases the 'name',
+# these fixtures will be stored mixed case unless they are all manually downcased here.
+# Put another way, please make sure to always use downcase/lowercase for the 'name' in these fixtures
+# to properly match the real behavior of the application.
+nature: {
+  name: nature,
+  additional_info: {issns: ['0028-0836', '1476-4687']}
+}
+
+the_new_england_journal_of_medicine: {
+  name: the new england journal of medicine,
+  additional_info: {issns: ['0028-4793', '1533-4406']}
+}
+
+nature_medicine: {
+  name: nature medicine,
+  additional_info: {issns: ['1078-8956', '1546-170X']}
+}
diff --git a/test/models/detector/journal_test.rb b/test/models/detector/journal_test.rb
new file mode 100644
index 0000000..cd68655
--- /dev/null
+++ b/test/models/detector/journal_test.rb
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: detector_journals
+#
+#  id              :integer          not null, primary key
+#  name            :string
+#  additional_info :json
+#  created_at      :datetime         not null
+#  updated_at      :datetime         not null
+#
+require 'test_helper'
+
+module Detector
+  class JournalTest < ActiveSupport::TestCase
+    test 'exact term match on journal name' do
+      expected = detector_journals('the_new_england_journal_of_medicine')
+      actual = Detector::Journal.full_term_match('the new england journal of medicine')
+
+      assert_equal(1, actual.count)
+      assert_equal(expected, actual.first)
+    end
+
+    test 'mixed case exact term match on journal name' do
+      expected = detector_journals('the_new_england_journal_of_medicine')
+      actual = Detector::Journal.full_term_match('The New England Journal of Medicine')
+
+      assert_equal(1, actual.count)
+      assert_equal(expected, actual.first)
+    end
+
+    test 'exact match within longer term returns no matches' do
+      actual = Detector::Journal.full_term_match('The New England Journal of Medicine, 1999')
+      assert_equal(0, actual.count)
+    end
+
+    test 'phrase match within longer term returns matches' do
+      actual = Detector::Journal.partial_term_match('words and stuff The New England Journal of Medicine, 1999')
+      assert_equal(1, actual.count)
+    end
+
+    test 'multple matches can happen with phrase matching within longer terms' do
+      actual = Detector::Journal.partial_term_match('words and stuff Nature medicine, 1999')
+      assert_equal(2, actual.count)
+    end
+  end
+end