Merge pull request #121 from MITLibraries/tco-106-phrases

Update Detectors to use phrase inputs instead of Terms
MITLibraries · Oct 17, 2024 · 78dd771 · 78dd771
2 parents 93d3fbd + 81d604d
commit 78dd771
Show file tree

Hide file tree

Showing 6 changed files with 112 additions and 92 deletions.
diff --git a/app/models/detector/citation.rb b/app/models/detector/citation.rb
@@ -5,9 +5,10 @@ class Detector
   # targeted at a particular citation format, but was designed based on characteristics of five formats: APA, MLA,
   # Chicago, Turabian, and IEEE.
   #
-  # It receives a Term object, which is parsed in various ways en route to calculating a final score. Terms with a
-  # higher score are more citation-like, while a score of 0 indicates a Term that has no hallmarks of being a citation.
-  # Terms whose score is higher than the REQUIRED_SCORE value can be registered as a Detection.
+  # It receives a phrase (often from `Term.phrase`), which is parsed in various ways en route to calculating a final
+  # score. Phrases with a higher score are more citation-like, while a score of 0 indicates a phrase that has no
+  # hallmarks of being a citation.
+  # Phrases whose score meets or exceeds the REQUIRED_SCORE value can be registered as a Detection.
   class Citation
     attr_reader :score, :subpatterns, :summary
 
@@ -26,12 +27,13 @@ class Citation
       quotes: /&quot;.*?&quot;/
     }.freeze
 
-    # The required score value is the threshold needed for a Term to be officially recorded with a Detection.
+    # The required score value is the threshold needed for a phrase to be officially recorded with a Detection via its
+    # associated Term.
     REQUIRED_SCORE = 6
 
     # Summary thresholds are used by the calculate_score method. This class counts the number of occurrences of specific
     # characters in the @summary instance variable. The thresholds here determine whether any of those counts are high
-    # enough to contribute to the Term's citation score.
+    # enough to contribute to the phrase's citation score.
     SUMMARY_THRESHOLDS = {
       characters: 25,
       colons: 2,
@@ -48,28 +50,31 @@ def detection?
       @score >= REQUIRED_SCORE
     end
 
-    # The initializer handles the parsing of a Term object, and subsequent population of the @subpatterns, @summary,
+    # The initializer handles the parsing of a phrase, and subsequent population of the @subpatterns, @summary,
     # and @score instance variables. @subpatterns contains all the citation components which have been flagged by the
-    # CITATION_PATTERNS hash. @summary contains counts of how often certain characters or words appear in the Term.
+    # CITATION_PATTERNS hash. @summary contains counts of how often certain characters or words appear in the phrase.
     # Finally, the @score value is a summary of how many elements in the subpatterns or summary report were detected.
     #
-    # @note This method can be called directly via Detector::Citation.new(Term). It is also called indirectly via the
+    # @note This method can be called directly via Detector::Citation.new(phrase). It is also called indirectly via the
     #       Detector::Citation.record(Term) class method. This method can be called directly when a Detection is not
     #       desired.
-    def initialize(term)
+    # @param phrase String. Often a `Term.phrase`.
+    # @return Nothing intentional. Data is written to the Hashes `@subpatterns` and `@summary`,
+    #   and to the numeric `@score`, during processing.
+    def initialize(phrase)
       @subpatterns = {}
       @summary = {}
-      pattern_checker(term.phrase)
-      summarize(term.phrase)
+      pattern_checker(phrase)
+      summarize(phrase)
       @score = calculate_score
     end
 
     # The record method first runs all of the parsers by running the initialize method. If the resulting score meets or
     # exceeds the REQUIRED_SCORE value, then a Detection is registered.
-    #
+    # @param term [Term]
     # @return nil
     def self.record(term)
-      cit = Detector::Citation.new(term)
+      cit = Detector::Citation.new(term.phrase)
       return unless cit.detection?
 
       Detection.find_or_create_by(
@@ -90,7 +95,7 @@ def self.record(term)
     # if the brackets pattern finds two matches, it still only adds one to the final score.
     #
     # For the summary report, each value is compared with a threshold value in the SUMMARY_THRESHOLDS constant. The
-    # number of values which meet or exceed their threshold are added to the score. As an example, if a search term has
+    # number of values which meet or exceed their threshold are added to the score. As an example, if a search phrase has
     # five words, this value is compared to the word threshold (also five). Because the threshold is met, the score gets
     # incremented by one.
     #
@@ -103,61 +108,61 @@ def calculate_score
       summary_score + @subpatterns.length
     end
 
-    # This calculates the number of characters in the search term. It is called by the summarize method.
-    def characters(term)
-      term.length
+    # This calculates the number of characters in the search phrase. It is called by the summarize method.
+    def characters(phrase)
+      phrase.length
     end
 
-    # This counts the number of colons that appear in the search term, because they tend to appear more often in
+    # This counts the number of colons that appear in the search phrase, because they tend to appear more often in
     # citations than in other searches. It is called by the summarize method.
-    def colons(term)
-      term.count(':')
+    def colons(phrase)
+      phrase.count(':')
     end
 
-    # This counts the number of commas in the search term. It is called by the summarize method.
-    def commas(term)
-      term.count(',')
+    # This counts the number of commas in the search phrase. It is called by the summarize method.
+    def commas(phrase)
+      phrase.count(',')
     end
 
     # This builds one of the two main components of the Citation detector - the subpattern report. It uses each of the
     # regular expressions in the CITATION_PATTERNS constant, extracting all matches using the scan method.
     #
     # @return hash
-    def pattern_checker(term)
+    def pattern_checker(phrase)
       CITATION_PATTERNS.each_pair do |type, pattern|
-        @subpatterns[type.to_sym] = scan(pattern, term) if scan(pattern, term).present?
+        @subpatterns[type.to_sym] = scan(pattern, phrase) if scan(pattern, phrase).present?
       end
     end
 
-    # This counts the number of periods in the search term. It is called by the summarize method.
-    def periods(term)
-      term.count('.')
+    # This counts the number of periods in the search phrase. It is called by the summarize method.
+    def periods(phrase)
+      phrase.count('.')
     end
 
     # This is a convenience method for the scan method, which is used by pattern_checker.
-    def scan(pattern, term)
-      term.scan(pattern).map(&:strip)
+    def scan(pattern, phrase)
+      phrase.scan(pattern).map(&:strip)
     end
 
-    # This counts the semicolons in the search term. It is called by the summarize method.
-    def semicolons(term)
-      term.count(';')
+    # This counts the semicolons in the search phrase. It is called by the summarize method.
+    def semicolons(phrase)
+      phrase.count(';')
     end
 
     # This builds one of the two main components of the Citation detector - the summary report. It calls each of the
     # methods in the first line - which all return integers - and puts the result as a key-value pair in the @summary
     # instance variable.
     #
     # @return hash
-    def summarize(term)
+    def summarize(phrase)
       %w[characters colons commas periods semicolons words].each do |check|
-        @summary[check.to_sym] = send(check, term)
+        @summary[check.to_sym] = send(check, phrase)
       end
     end
 
-    # This counts the number of words in the search term. It is called by the summarize method.
-    def words(term)
-      term.split.length
+    # This counts the number of words in the search phrase. It is called by the summarize method.
+    def words(phrase)
+      phrase.split.length
     end
   end
 end
diff --git a/app/models/detector/lcsh.rb b/app/models/detector/lcsh.rb
@@ -12,9 +12,12 @@ class Lcsh
     # For now the initialize method just needs to run the pattern checker. A space for future development would be to
     # write additional methods to look up the detected LCSH for more information, and to confirm that the phrase is
     # actually an LCSH.
-    def initialize(term)
+    #
+    #   @param phrase String. Often a `Term.phrase`.
+    #   @return Nothing intentional. Data is written to Hash `@detections` during processing.
+    def initialize(phrase)
       @detections = {}
-      term_pattern_checker(term)
+      pattern_checker(phrase)
     end
 
     # The record method will consult the set of regex-based detectors that are defined in Detector::Lcsh. Any matches
@@ -41,10 +44,10 @@ def self.record(term)
 
     private
 
-    # term_patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH
+    # patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH
     # string. At the moment there is only one - for the separator character " -- " - but others might be possible if
     # there are regex-able vocabulary quirks which might separate subject values from non-subject values.
-    def term_patterns
+    def patterns
       {
         separator: /(.*)\s--\s(.*)/
       }

diff --git a/app/models/detector/pattern_checker.rb b/app/models/detector/pattern_checker.rb
@@ -4,9 +4,13 @@ class Detector
   # PatternChecker is intended to be added to Detectors via `include Detector::PatternChecker` to make
   # these methods available to instances of the class
   module PatternChecker
-    def term_pattern_checker(term)
-      term_patterns.each_pair do |type, pattern|
-        @detections[type.to_sym] = match(pattern, term) if match(pattern, term).present?
+    # pattern_checker iterates over all patterns defined in the calling object's `patterns` method.
+    #
+    #   @param phrase [String]. Often a `Term.phrase`.
+    #   @return Nothing intentional. Data is written to Hash `@detections` during processing.
+    def pattern_checker(phrase)
+      patterns.each_pair do |type, pattern|
+        @detections[type.to_sym] = match(pattern, phrase) if match(pattern, phrase).present?
       end
     end
 
@@ -15,8 +19,13 @@ def term_pattern_checker(term)
     # might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect
     # all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing
     # that change did result in unintended results so it was backed out for now.
-    def match(pattern, term)
-      pattern.match(term).to_s.strip
+    #
+    #   @param pattern Regexp
+    #   @param phrase String. Often a `Term.phrase`.
+    #
+    #   @return String
+    def match(pattern, phrase)
+      pattern.match(phrase).to_s.strip
     end
   end
 end
diff --git a/app/models/detector/standard_identifiers.rb b/app/models/detector/standard_identifiers.rb
@@ -13,9 +13,12 @@ def self.table_name_prefix
     # shared instance methods
     include Detector::PatternChecker
 
-    def initialize(term)
+    # Initialization process will run pattern checkers and strip invalid ISSN detections.
+    #   @param phrase String. Often a `Term.phrase`.
+    #   @return Nothing intentional. Data is written to Hash `@detections` during processing.
+    def initialize(phrase)
       @detections = {}
-      term_pattern_checker(term)
+      pattern_checker(phrase)
       strip_invalid_issns
     end
 
@@ -43,8 +46,8 @@ def self.record(term)
 
     private
 
-    # term_patterns are regex patterns to be applied to the basic search box input
-    def term_patterns
+    # patterns are regex patterns to be applied to the basic search box input
+    def patterns
       {
         isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
         issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,