Skip to content

Commit

Permalink
Better documented the search algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
fbanados committed Nov 25, 2024
1 parent f4515f0 commit 91c7744
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 4 deletions.
2 changes: 2 additions & 0 deletions src/morphodict/search/espt.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(self, query, search_results):
self.search_results = search_results
self.query = query
self.query_analyzed_ok = False
self.tags = None

def convert_search_query_to_espt(self):
"""Analyze this search’s search_results query, possibly updating it.
Expand Down Expand Up @@ -80,6 +81,7 @@ def convert_search_query_to_espt(self):
tags=analyzed_query.tags,
new_tags=self.new_tags,
)
self.tags = analyzed_query.tags

def inflect_search_results(self):
if not self.query_analyzed_ok:
Expand Down
10 changes: 10 additions & 0 deletions src/morphodict/search/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,17 @@


def fetch_results(query: core.Query, search_results: core.SearchResults):
# First collect some candidate results via keywords.
# We split the query string into keywords, and collect all the entries that
# match exactly as keywords in the database, both source and target.

fetch_results_from_target_language_keywords(query, search_results)
fetch_results_from_source_language_keywords(query, search_results)

# Then we proceed to analyze the query; if successful, we look for those
# entries in the dictionary that share the analysis with the FST result.
# This introduces source-level spelling relaxation if the FST supports it.

# Use the spelling relaxation to try to decipher the query
# e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" --
# thus, we can match "acâhkos" in the dictionary!
Expand Down Expand Up @@ -51,6 +59,8 @@ def fetch_results(query: core.Query, search_results: core.SearchResults):
# fst_analyses has now been thinned by calls to `fst_analyses.remove()`
# above; remaining items are analyses which are not in the database,
# although their lemmas should be.
#
# Therefore, we create the extra entries on the fly.
for analysis in fst_analyses:
# When the user query is outside of paradigm tables
# e.g. mad preverb and reduplication: ê-mâh-misi-nâh-nôcihikocik
Expand Down
7 changes: 4 additions & 3 deletions src/morphodict/search/pos_matches.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from morphodict.search.core import SearchResults
from morphodict.phrase_translate.fst import source_phrase_analyses
from morphodict.search.espt import EsptSearch
from morphodict.analysis import rich_analyze_relaxed


def find_pos_matches(search_results: SearchResults) -> None:
if len(search_results.verbose_messages) <= 1:
def find_pos_matches(tag_source: EsptSearch | None, search_results: SearchResults) -> None:
if not tag_source:
return
tags = search_results.verbose_messages[1].get("tags")
tags = tag_source.tags
[pos_match(result, tags) for result in search_results.unsorted_results()]


Expand Down
19 changes: 18 additions & 1 deletion src/morphodict/search/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def search(
espt_search = EsptSearch(search_query, search_results)
espt_search.convert_search_query_to_espt()

# Now, check if we were asked to do only vector distance results, and if so,
# compute them and return them:
if settings.MORPHODICT_ENABLE_CVD:
cvd_search_type: CvdSearchType = first_non_none_value(
search_query.cvd, default=CvdSearchType.DEFAULT
Expand All @@ -64,8 +66,13 @@ def sort_by_cvd(r: Result):
do_cvd_search(search_query, search_results)
return search_results

# We were NOT asked for only vector distance results, so now we actually
# go and perform the search.

# First, fetch keyword-based and FST-based orthography-relaxed results
fetch_results(search_query, search_results)

# If allowed, add affix search candidates
if (
settings.MORPHODICT_ENABLE_AFFIX_SEARCH
and include_affixes
Expand All @@ -74,21 +81,31 @@ def sort_by_cvd(r: Result):
do_source_language_affix_search(search_query, search_results)
do_target_language_affix_search(search_query, search_results)

# Now, if we wanted to do vector search (not exclusively), add the results.
if settings.MORPHODICT_ENABLE_CVD:
if cvd_search_type.should_do_search() and not is_almost_certainly_cree(
search_query, search_results
):
do_cvd_search(search_query, search_results)

# If we did an English phrase search, we have to inflect the results back!
if (search_query.espt or inflect_english_phrases) and (
len(initial_query_terms) > 1
):
espt_search.inflect_search_results()

find_pos_matches(search_results)
# Annotate every entry in search results with the POS match when that is available
if espt_search:
find_pos_matches(espt_search, search_results)

# Annotate every entry with a frequency count from the glossary
get_glossary_count(search_results)

# Annotate every entry with a lemma frequency from lemma_frequency.txt
get_lemma_freq(search_results)

# Return. NOTE THAT WE HAVE NOT SORTED RESULTS YET!
# This will be done when we call sorted_results
return search_results


Expand Down

0 comments on commit 91c7744

Please sign in to comment.