Better documented the search algorithm

UAlbertaALTLab · Nov 25, 2024 · 91c7744 · 91c7744
1 parent f4515f0
commit 91c7744
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 4 deletions.
diff --git a/src/morphodict/search/espt.py b/src/morphodict/search/espt.py
@@ -43,6 +43,7 @@ def __init__(self, query, search_results):
         self.search_results = search_results
         self.query = query
         self.query_analyzed_ok = False
+        self.tags = None
 
     def convert_search_query_to_espt(self):
         """Analyze this search’s search_results query, possibly updating it.
@@ -80,6 +81,7 @@ def convert_search_query_to_espt(self):
             tags=analyzed_query.tags,
             new_tags=self.new_tags,
         )
+        self.tags = analyzed_query.tags
 
     def inflect_search_results(self):
         if not self.query_analyzed_ok:

diff --git a/src/morphodict/search/lookup.py b/src/morphodict/search/lookup.py
@@ -21,9 +21,17 @@
 
 
 def fetch_results(query: core.Query, search_results: core.SearchResults):
+    # First collect some candidate results via keywords.
+    # We split the query string into keywords, and collect all the entries that
+    # match exactly as keywords in the database, both source and target.
+
     fetch_results_from_target_language_keywords(query, search_results)
     fetch_results_from_source_language_keywords(query, search_results)
 
+    # Then we proceed to analyze the query. If successful, we look for those
+    # entries in the dictionary that share the analysis with the FST result.
+    # This introduces source-level spelling relaxation if the FST supports it.
+
     # Use the spelling relaxation to try to decipher the query
     #   e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" --
     #         thus, we can match "acâhkos" in the dictionary!
@@ -51,6 +59,8 @@ def fetch_results(query: core.Query, search_results: core.SearchResults):
     # fst_analyses has now been thinned by calls to `fst_analyses.remove()`
     # above; remaining items are analyses which are not in the database,
     # although their lemmas should be.
+    #
+    # Therefore, we create the extra entries on the fly.
     for analysis in fst_analyses:
         # When the user query is outside of paradigm tables
         # e.g. mad preverb and reduplication: ê-mâh-misi-nâh-nôcihikocik

diff --git a/src/morphodict/search/pos_matches.py b/src/morphodict/search/pos_matches.py
@@ -1,12 +1,13 @@
 from morphodict.search.core import SearchResults
 from morphodict.phrase_translate.fst import source_phrase_analyses
+from morphodict.search.espt import EsptSearch
 from morphodict.analysis import rich_analyze_relaxed
 
 
-def find_pos_matches(search_results: SearchResults) -> None:
-    if len(search_results.verbose_messages) <= 1:
+def find_pos_matches(tag_source: EsptSearch | None, search_results: SearchResults) -> None:
+    if not tag_source:
         return
-    tags = search_results.verbose_messages[1].get("tags")
+    tags = tag_source.tags
     [pos_match(result, tags) for result in search_results.unsorted_results()]
 
 

diff --git a/src/morphodict/search/runner.py b/src/morphodict/search/runner.py
@@ -49,6 +49,8 @@ def search(
         espt_search = EsptSearch(search_query, search_results)
         espt_search.convert_search_query_to_espt()
 
+    # Now, check if we were asked to do only vector distance results, and if so,
+    # compute them and return them:
     if settings.MORPHODICT_ENABLE_CVD:
         cvd_search_type: CvdSearchType = first_non_none_value(
             search_query.cvd, default=CvdSearchType.DEFAULT
@@ -64,8 +66,13 @@ def sort_by_cvd(r: Result):
             do_cvd_search(search_query, search_results)
             return search_results
 
+    # We were NOT asked for only vector distance results, so now we actually
+    # go and perform the search.
+
+    # First, fetch keyword-based and FST-based orthography-relaxed results
     fetch_results(search_query, search_results)
 
+    # If allowed, add affix search candidates
     if (
         settings.MORPHODICT_ENABLE_AFFIX_SEARCH
         and include_affixes
@@ -74,21 +81,31 @@ def sort_by_cvd(r: Result):
         do_source_language_affix_search(search_query, search_results)
         do_target_language_affix_search(search_query, search_results)
 
+    # Now, if we wanted to do vector search (not exclusively), add the results.
     if settings.MORPHODICT_ENABLE_CVD:
         if cvd_search_type.should_do_search() and not is_almost_certainly_cree(
             search_query, search_results
         ):
             do_cvd_search(search_query, search_results)
 
+    # If we did an English phrase search, we have to inflect the results back!
     if (search_query.espt or inflect_english_phrases) and (
         len(initial_query_terms) > 1
     ):
         espt_search.inflect_search_results()
 
-    find_pos_matches(search_results)
+    # Annotate every entry in the search results with its POS match, when available.
+    if espt_search:
+        find_pos_matches(espt_search, search_results)
+
+    # Annotate every entry with a frequency count from the glossary
     get_glossary_count(search_results)
+
+    # Annotate every entry with a lemma frequency from lemma_frequency.txt
     get_lemma_freq(search_results)
 
+    # Return. NOTE THAT WE HAVE NOT SORTED RESULTS YET!
+    # This will be done when we call sorted_results
     return search_results