Skip to content

Commit

Permalink
Merge pull request #110 from MITLibraries/tco-83-categorization
Browse files Browse the repository at this point in the history
Implement Categorization
  • Loading branch information
matt-bernhardt authored Oct 1, 2024
2 parents d362bc5 + 590988a commit 862a979
Show file tree
Hide file tree
Showing 17 changed files with 471 additions and 20 deletions.
14 changes: 14 additions & 0 deletions app/graphql/types/categories_type.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# frozen_string_literal: true

module Types
# GraphQL object type describing one category attached to a search term, plus the confidence of that attachment.
# NOTE(review): the wrapped object presumably is a Categorization record (it must respond to `category` and
# `confidence`) — confirm against the resolvers that return this type.
class CategoriesType < Types::BaseObject
description 'Information about one category linked to this search term'

# No resolver method is defined for `confidence` in this class, so it is presumably read straight from the wrapped
# object by the GraphQL library's default resolution — confirm.
field :confidence, Float, null: false, description: 'The application\'s confidence that the term belongs to this category - measured from 0.0 to 1.0'
field :name, String, null: false, description: 'The name of this category'

# Resolves the `name` field. The name is read from the wrapped object's associated category rather than from the
# wrapped object itself.
#
# @return the value of `name` on the associated category
def name
@object.category.name
end
end
end
1 change: 1 addition & 0 deletions app/graphql/types/query_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def nodes(ids:)

# Records a search event for the supplied phrase.
#
# The Term matching the phrase is created or found, asked to calculate its categorizations, and then a new search
# event is stored against it.
#
# @param search_term the phrase that was searched for; used as the Term's phrase
# @param source_system the value stored as the search event's source
# @return whatever `search_events.create!` returns for the new event
def log_search_event(search_term:, source_system:)
  Term.create_or_find_by!(phrase: search_term).then do |matched_term|
    matched_term.calculate_categorizations
    matched_term.search_events.create!(source: source_system)
  end
end

Expand Down
5 changes: 5 additions & 0 deletions app/graphql/types/search_event_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

module Types
class SearchEventType < Types::BaseObject
field :categories, [Types::CategoriesType], description: 'The list of categories linked to term provided in this search'
field :created_at, GraphQL::Types::ISO8601DateTime, null: false
field :detectors, Types::DetectorsType
field :id, ID, null: false
Expand All @@ -14,6 +15,10 @@ def phrase
@object.term.phrase
end

# Resolves the `categories` field with the categorizations belonging to the term of this search event.
# NOTE(review): no detector-version filter is applied here, so categorizations from every version are returned —
# confirm that is intended.
#
# @return whatever `categorizations` returns for the event's term
def categories
  searched_term = @object.term
  searched_term.categorizations
end

# Resolves the `detectors` field by returning the phrase of the term attached to this search event.
# NOTE(review): presumably the detectors type uses this phrase as its input — confirm in that type.
#
# @return the phrase of the event's term
def detectors
  searched_term = @object.term
  searched_term.phrase
end
Expand Down
5 changes: 5 additions & 0 deletions app/graphql/types/term_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

module Types
class TermType < Types::BaseObject
field :categories, [Types::CategoriesType], description: 'The list of categories linked to this term'
field :created_at, GraphQL::Types::ISO8601DateTime, null: false
field :detectors, Types::DetectorsType
field :id, ID, null: false
Expand All @@ -14,6 +15,10 @@ def occurence_count
@object.search_events.count
end

# Resolves the `categories` field with the categorizations belonging to this term.
# NOTE(review): no detector-version filter is applied here, so categorizations from every version are returned —
# confirm that is intended.
#
# @return whatever `categorizations` returns for the wrapped term
def categories
  term = @object
  term.categorizations
end

# Resolves the `detectors` field by returning this term's phrase.
# NOTE(review): presumably the detectors type uses this phrase as its input — confirm in that type.
#
# @return the phrase of the wrapped term
def detectors
  term = @object
  term.phrase
end
Expand Down
41 changes: 41 additions & 0 deletions app/models/categorization.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# frozen_string_literal: true

# A Categorization is a joining record between a Term and a Category, created when a set of Detections are summarized to
# calculate the final confidence that a Term belongs to a specific Category.
#
# There is a uniqueness constraint on the combination of term_id, category_id, and detector_version.
#
# New records can be created by passing a Term, Category, and a confidence score. The model will look up the current
# detector version, and include that in the record.
#
# == Schema Information
#
# Table name: categorizations
#
# id :integer not null, primary key
# category_id :integer not null
# term_id :integer not null
# confidence :float
# detector_version :string
# created_at :datetime not null
# updated_at :datetime not null
#
class Categorization < ApplicationRecord
belongs_to :term
belongs_to :category

# We use the before_create hook to prevent needing to override the initialize method, which Rails frowns upon.
# Because the hook assigns detector_version unconditionally, any value supplied by the caller is overwritten when the
# record is created.
before_create :set_defaults

# This scope filters Categorization records down to those whose detector_version matches the DETECTOR_VERSION
# environment variable, falling back to 'unset' when that variable is not defined.
scope :current, -> { where(detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')) }

private

# This looks up the current Detector Version from the environment, storing the value as part of the record which is
# about to be saved. This prevents the rest of the application from having to worry about this value, while also
# providing a mechanism to prevent duplicate records from being created.
#
# The same fallback value ('unset') is used here and in the `current` scope, so records created without a configured
# version are still matched by that scope.
#
# @return [String] the detector version that was assigned
def set_defaults
self.detector_version = ENV.fetch('DETECTOR_VERSION', 'unset')
end
end
1 change: 1 addition & 0 deletions app/models/category.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
class Category < ApplicationRecord
# Join records between this category and detectors; they are destroyed together with the category.
has_many :detector_categories, dependent: :destroy
# Detectors reached through the detector_categories join records.
has_many :detectors, through: :detector_categories
# Categorization records referencing this category; they are destroyed together with the category.
has_many :categorizations, dependent: :destroy
end
14 changes: 14 additions & 0 deletions app/models/detection.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ class Detection < ApplicationRecord
scope :for_detector, ->(detector) { where(detector_id: detector.id) }
scope :for_term, ->(term) { where(term_id: term.id) }

# Summarizes this detection as a list of single-entry hashes, one for each category link of the detection's Detector.
# Every hash maps the linked category's id to the confidence stored on that link.
#
# This structure is summarized further in the Term model.
#
# @note A detector linked only to one category (category_id of 2) with a confidence of 0.95 yields [ { 2 => 0.95 } ].
#
# @return array of hashes, e.g. [ { 1 => 0.4 }, { 2 => 0.95 } ]
def scores
  detector.detector_categories.map do |link|
    Hash[link.category_id, link.confidence]
  end
end

private

# This looks up the current Detector Version from the environment, storing the value as part of the record which is
Expand Down
45 changes: 45 additions & 0 deletions app/models/term.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
class Term < ApplicationRecord
has_many :search_events, dependent: :destroy
has_many :detections, dependent: :destroy
has_many :categorizations, dependent: :destroy

# The record_detections method is the one-stop method to call every Detector's record method that is defined within
# the application.
Expand All @@ -27,4 +28,48 @@ def record_detections

nil
end

# Receives an array of individual confidence values, and returns the calculated categorization score.
#
# @note For now, we are just calculating the average of all confidences, but this was chosen arbitrarily. This will
#   need to be studied more rigorously when we have more data.
#
# @param values [Array<Numeric>] the individual confidence values to summarize
# @return [Float] the mean of the values rounded to two decimal places, or 0.0 when no values are given
def calculate_confidence(values)
  # Without this guard an empty list divides zero by zero and raises ZeroDivisionError.
  return 0.0 if values.empty?

  # fdiv always divides as floating point, so integer confidences such as [1, 0] average to 0.5 instead of 0.
  values.sum.fdiv(values.size).round(2)
end

# The calculate_categorizations method records detections for this term, summarizes the current detections' confidence
# scores per category, and stores one Categorization per category for the current detector version.
#
# Records are looked up by term, category, and current detector version only. The confidence is deliberately not part
# of the lookup: when a recalculated confidence differs from the stored one, the existing record is updated rather
# than a second record being inserted (which the unique index on term, category and detector version would reject).
#
# @return array of arrays of Categorization records, one inner array per scored category
def calculate_categorizations
  record_detections
  # Each entry looks like { 3 => [0.91, 0.95] }: a category id mapped to every confidence detected for it.
  retrieve_detection_scores.map do |entry|
    entry.map do |category_id, confidences|
      categorization = Categorization.current.find_or_initialize_by(term: self, category_id: category_id)
      categorization.confidence = calculate_confidence(confidences)
      # save (rather than save!) keeps the previous non-raising behavior when a record fails validation.
      categorization.save
      categorization
    end
  end
end

private

# Gathers the scores of every current detection for this term and regroups them by category, so that the
# calculate_categorizations method can summarize them. In other words it turns per-detection data such as
# [{3=>0.91}, {1=>0.1}] and [{3=>0.95}]
# into [{3=>[0.91, 0.95]}, {1=>[0.1]}]
#
# @return an array of hashes, e.g. [{3=>[0.91, 0.95]}, {1=>[0.1]}]
def retrieve_detection_scores
  # Flattened list of single-entry hashes, e.g. [{3=>0.91}, {1=>0.1}, {3=>0.95}]
  flattened = detections.current.flat_map(&:scores)
  # Collect the confidences under their category id, keeping the order in which categories were first seen.
  buckets = flattened.each_with_object({}) do |score, collected|
    (collected[score.keys.first] ||= []) << score.values.first
  end
  buckets.map { |category_id, confidences| { category_id => confidences } }
end
end
14 changes: 14 additions & 0 deletions db/migrate/20240923182249_create_categorization.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Creates the categorizations table, which joins a term to a category together with a confidence score and the
# detector version recorded alongside it.
class CreateCategorization < ActiveRecord::Migration[7.1]
def change
create_table :categorizations do |t|
# Both references are mandatory and backed by foreign key constraints.
t.belongs_to :category, null: false, foreign_key: true
t.belongs_to :term, null: false, foreign_key: true
# Neither of these two columns is declared NOT NULL.
t.float :confidence
t.string :detector_version

t.timestamps
end
# At most one categorization may exist per term, category and detector version. The same three columns are indexed
# in both orders, presumably so lookups starting from either a term or a category can use an index — confirm.
add_index :categorizations, [:term_id, :category_id, :detector_version], unique: true
add_index :categorizations, [:category_id, :term_id, :detector_version], unique: true
end
end
17 changes: 16 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

72 changes: 53 additions & 19 deletions docs/reference/classes.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,22 @@ classDiagram
Term "1" --> "0..*" Categorization
Detection "0..*" --> "1" Detector
DetectionCategory "0..*" --> "1" Category
DetectorCategory "0..*" --> "1" Category
Categorization "0..*" --> "1" Category
Detector "1" --> "0..*" DetectionCategory
Detector "1" --> "0..*" DetectorCategory
class User
User: +String uid
User: +String email
User: +Boolean admin
class Term
Term: id
Term: +String phrase
Term: calculateCategory()
Term: combinedScores()
Term: recordDetections()
Term: recordPatterns()
Term: recordJournals()
Term: recordSuggestedResources()
class SearchEvent
SearchEvent: +Integer id
Expand All @@ -53,13 +50,11 @@ classDiagram
Detection: current()
Detection: for_detector()
Detection: for_term()
Detection: scores()
class Detector
Detector: +Integer id
Detector: +String name
Detector: +Float confidence
Detector: incrementConfidence()
Detector: decrementConfidence()
class Category
Category: +Integer id
Expand All @@ -69,21 +64,60 @@ classDiagram
Categorization: +Integer category_id
Categorization: +Integer term_id
Categorization: +Float confidence
class DetectionCategory
DetectionCategory: +Integer id
DetectionCategory: +Integer detector_id
DetectionCategory: +Integer category_id
DetectionCategory: +Float confidence
DetectionCategory: incrementConfidence()
DetectionCategory: decrementConfidence()
Categorization: +String detector_version
Categorization: current()
class DetectorCategory
DetectorCategory: +Integer id
DetectorCategory: +Integer detector_id
DetectorCategory: +Integer category_id
DetectorCategory: +Float confidence
DetectorCategory: incrementConfidence()
DetectorCategory: decrementConfidence()
class DetectorJournal
DetectorJournal: full_term_match()
DetectorJournal: partial_term_match()
DetectorJournal: record()
class DetectorStandardIdentifier
DetectorStandardIdentifier: record()
class DetectorSuggestedResource
DetectorSuggestedResource: bulk_replace()
DetectorSuggestedResource: calculate_fingerprint()
DetectorSuggestedResource: full_term_match()
DetectorSuggestedResource: record()
DetectorSuggestedResource: update_fingerprint()
namespace SearchActivity{
class Term
class SearchEvent
}
namespace KnowledgeGraph{
class Detectors
class DetectorCategory
class Category
}
namespace Detectors {
class Detector
class DetectorJournal["Detector::Journal"]
class DetectorStandardIdentifier["Detector::StandardIdentifiers"]
class DetectorSuggestedResource["Detector::SuggestedResource"]
}
style SearchEvent fill:#000,stroke:#66c2a5,color:#66c2a5,stroke-width:4px;
style Term fill:#000,stroke:#66c2a5,color:#66c2a5,stroke-width:4px;
style Category fill:#000,stroke:#fc8d62,color:#fc8d62
style DetectionCategory fill:#000,stroke:#fc8d62,color:#fc8d62
style DetectorCategory fill:#000,stroke:#fc8d62,color:#fc8d62
style Detector fill:#000,stroke:#fc8d62,color:#fc8d62
style DetectorJournal fill:#000,stroke:#fc8d62,color:#fc8d62
style DetectorStandardIdentifier fill:#000,stroke:#fc8d62,color:#fc8d62
style DetectorSuggestedResource fill:#000,stroke:#fc8d62,color:#fc8d62
style Categorization fill:#000,stroke:#8da0cb,color:#8da0cb,stroke-dasharray: 3 5;
style Detection fill:#000,stroke:#8da0cb,color:#8da0cb,stroke-dasharray: 3 5;
Expand Down
32 changes: 32 additions & 0 deletions test/controllers/graphql_controller_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,36 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest
assert_nil(json['data']['logSearchEvent']['detectors']['standardIdentifiers'].first['details']['authors'])
end
end

# Logs a search event for a DOI-style phrase through the GraphQL endpoint and checks the categorization details that
# come back in the same response.
test 'search event query can return categorization details for searches that trip a detector' do
post '/graphql', params: { query: '{
logSearchEvent(sourceSystem: "timdex", searchTerm: "https://doi.org/10.1080/10509585.2015.1092083.") {
categories {
name
confidence
}
}
}' }

json = response.parsed_body

# Only the first returned category is inspected; the confidence is compared with assert_in_delta because it is a float.
assert_equal 'Transactional', json['data']['logSearchEvent']['categories'].first['name']
assert_in_delta 0.95, json['data']['logSearchEvent']['categories'].first['confidence']
end

# Looks up a DOI-style phrase through the GraphQL endpoint and checks the categorization details in the response.
# NOTE(review): this presumably relies on categorization data already existing for the phrase (e.g. fixtures), since
# nothing in this test creates it — confirm.
test 'term lookup query can return categorization details for searches that trip a detector' do
post '/graphql', params: { query: '{
lookupTerm(searchTerm: "10.1016/j.physio.2010.12.004") {
categories {
name
confidence
}
}
}' }

json = response.parsed_body

# Only the first returned category is inspected; the confidence is compared with assert_in_delta because it is a float.
assert_equal 'Transactional', json['data']['lookupTerm']['categories'].first['name']
assert_in_delta 0.95, json['data']['lookupTerm']['categories'].first['confidence']
end
end
Loading

0 comments on commit 862a979

Please sign in to comment.