[Search] Case sensitive across engines (#309)

* [Search] Add case-sensitivity option
* [Search] Reworked search function so that it is readable and documented
* [Tests] Reworked the test so that we check the results content
* [SQLite] Add Case Sensitivity to LIKEs for SQLite

---------

Co-authored-by: Thibault Clérice <[email protected]>
hipster-philology · Mar 14, 2024 · 2ca595d · 2ca595d
1 parent 139b6f2
commit 2ca595d
Show file tree

Hide file tree

Showing 6 changed files with 155 additions and 65 deletions.
diff --git a/app/__init__.py b/app/__init__.py
@@ -8,8 +8,11 @@
 from flask_sqlalchemy import SQLAlchemy
 from flask_wtf import CSRFProtect
 from flask_babel import Babel
+from sqlalchemy.engine import Engine
 from .ext_config import get_locale
 from .markdown_ext import Markdown
+from sqlite3 import Connection as SQLite3Connection
+
 
 basedir = os.path.abspath(os.path.dirname(__file__))
 
@@ -38,7 +41,15 @@ def create_app(config_name="dev"):
         app.config.from_object(config)
     else:
         app.config.from_object(config[config_name])
-
+
+    # SQLite does not perform CASE SENSITIVE LIKEs by default.
+    if app.config["SQLALCHEMY_DATABASE_URI"].startswith("sqlite:"):
+        @db.event.listens_for(Engine, "connect")
+        def _set_sqlite_case_insensitive_pragma(dbapi_con, connection_record):
+            """Make SQLite LIKE comparisons case-sensitive (SQLite's default is case-insensitive)."""
+            if isinstance(dbapi_con, SQLite3Connection):
+                dbapi_con.execute("PRAGMA case_sensitive_like=ON;")
+
     app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
 
     config[config_name].init_app(app)

diff --git a/app/main/views/tokens.py b/app/main/views/tokens.py
@@ -7,6 +7,9 @@
 from sqlalchemy import func
 import math
 from csv import DictWriter
+from io import StringIO
+from itertools import product
+from typing import Dict, Optional, List, Tuple
 
 from .utils import render_template_with_nav_info, request_wants_json, requires_corpus_access
 from .. import main
@@ -15,10 +18,7 @@
 from ...utils.forms import string_to_none, strip_or_none, column_search_filter, prepare_search_string
 from ...utils.pagination import int_or
 from ...utils.tsv import TSV_CONFIG, stream_tsv
-from ...utils.tsv import TSV_CONFIG, stream_tsv
 from ...utils.response import stream_template
-from io import StringIO
-from itertools import product
 
 
 @main.route('/corpus/<int:corpus_id>/tokens/correct')
@@ -271,47 +271,59 @@ def tokens_search_through_fields(corpus_id):
     :param corpus_id: Id of the corpus
     """
     corpus = Corpus.query.get_or_404(corpus_id)
+    # Refuse access (403) when the current user is not allowed to see this corpus
     if not corpus.has_access(current_user):
         abort(403)
-
+    # Names of the columns available for this corpus (form, POS, etc.)
     columns = tuple(["form"] + [
         col if col == "POS" else col.lower()
         for col in corpus.get_columns_headings()
     ])
 
-    input_values = {}
+    input_values: Dict[str, Optional[str]] = {}
 
-    # make a dict with values splitted for each OR operator
-    fields = {}
-    source_dict = request.form if request.method == "POST" else request.args
+    # make a dict with values split for each OR operator
+    fields: Dict[str, List[str]] = {}
+    source_dict: Dict[str, str] = request.form if request.method == "POST" else request.args
 
     for name in columns:
-        value = strip_or_none(source_dict.get(name))
+        value: Optional[str] = strip_or_none(source_dict.get(name))
         input_values[name] = value
 
         # split values with the '|' OR operator but keep escaped '\|' ones
-        fields[name] = prepare_search_string(value) if value is not None else ""
+        if value:
+            fields[name] = prepare_search_string(value)
 
     # all search combinations
-    search_branches = [
-        dict(prod)
-        for prod in product(*[
+    flat_fields: List[List[Tuple[str, str]]] = [
             [
                 (field, value)
                 for value in fields[field]
             ]
             for field in fields
-        ])
-    ]
+        ]
+    # Build every possible search combination: the cartesian product of the
+    # '|'-separated alternatives of each field.
+    # If source_dict = {"POS": "NOM|VER", "lemma": "mang*"}
+    # Then flat_fields = [[("POS", "NOM"), ("POS", "VER")], [("lemma", "mang*")]]
+    # And search_branches :
+    #    [{"POS": "NOM", "lemma": "mang*"}, {"POS": "VER", "lemma": "mang*"}]
+    # The * unpacks flat_fields into separate arguments, e.g. with flat_fields = [["a", "b"], ["c"]]:
+    # product(*flat_fields) == product(flat_fields[0], flat_fields[1])
+    search_branches: List[Dict[str, str]] = [dict(prod) for prod in product(*flat_fields)]
 
     value_filters = []
+    case_insensitive = True
+    if 'caseBox' in source_dict:
+        case_insensitive = False
     # for each branch filter (= OR clauses if any)
     for search_branch in search_branches:
+        # minimal filter: the token must belong to the requested corpus (id)
         branch_filters = [WordToken.corpus == corpus_id]
 
         # for each field (lemma, pos, form, morph)
         for name, value in search_branch.items():
-            branch_filters.extend(column_search_filter(getattr(WordToken, name), value))
+            # turn the (field, value) pair into SQLAlchemy filter clauses
+            branch_filters.extend(column_search_filter(getattr(WordToken, name), value, case_sensitive=case_insensitive))
 
         value_filters.append(branch_filters)
 

diff --git a/app/templates/main/tokens_search_through_fields.html b/app/templates/main/tokens_search_through_fields.html
@@ -59,7 +59,7 @@ <h1 class="mt-3"><i class="fa fa-search mr-2"></i>{{ _('Corpus') }} {{ corpus.na
             </tr>
             </tbody>
         </table>
-
+        <input type="checkbox" name="caseBox" id="caseBox"/> Deactivate case sensitivity (uppercase and lowercase letters are then treated as equivalent in the search).
         <div class="mb-3">
             <small class="form-text text-muted">{{ _('* can be used to match partial words, eg.') }} <b>ADV*</b></small>
             <small class="form-text text-muted">{{ _('! can be used to negate a match, eg.') }} <b>!PRE</b></small>

diff --git a/app/utils/forms.py b/app/utils/forms.py
@@ -1,10 +1,15 @@
+from typing import List, Optional
 from csv import DictReader
+from sqlalchemy import func
 
 from app.utils import StringDictReader
 from app.utils.tsv import TSV_CONFIG
+from sqlalchemy.sql import ColumnExpressionArgument, ColumnElement
 
 
-def string_to_none(string):
+def string_to_none(string: Optional[str]) -> Optional[str]:
+    """ Converts a string to None, including a string marked as None
+    """
     if string is None:
         return
     elif string.strip() == "None":
@@ -13,13 +18,14 @@ def string_to_none(string):
         return string
 
 
-def strip_or_none(string):
+def strip_or_none(string: Optional[str]) -> Optional[str]:
+    """Strip a string if it is not none"""
     if string is not None:
         return string.strip()
     return string
 
 
-def prepare_search_string(string: str) -> list:
+def prepare_search_string(string: str) -> List[str]:
     """ Transform a search string into a list of strings if "|" was used inside the string
 
     Agrees with escaped pipes.
@@ -32,52 +38,85 @@ def prepare_search_string(string: str) -> list:
     return value
 
 
-def column_search_filter(field, value: str) -> list:
+def column_search_filter(
+        field: ColumnElement,
+        value: str,
+        case_sensitive: bool = True) -> List[ColumnExpressionArgument]:
     """ Based on a field name and a string value, computes the list of search WHERE that needs to be \
     applied to a query
 
     :param field: ORM Field Property
     :param value: Search String
+    :param case_sensitive: Enable case sensitivity
     :return: List of WHERE clauses
     """
     branch_filters = []
-    if len(value) > 0:
-        value = value.replace(" ", "")
-        # escape search operators
-        value = value.replace('%', '\\%')
-        value = value.replace('\\*', '¤$¤')
-        value = value.replace('\\!', '¤$$¤')
-
-        value = string_to_none(value)
-        # distinguish LIKE from EQ
-        if value is not None and "*" in value:
-            value = value.replace("*", "%")
-            # unescape '\*'
-            value = value.replace('¤$¤', '*')
-
-            if value.startswith("!") and len(value) > 1:
-                value = value[1:]
-                branch_filters.append(field.notlike(value, escape='\\'))
-            else:
-                # unescape '\!'
-                value = value.replace('¤$$¤', '!')
-                branch_filters.append(field.like(value, escape='\\'))
+    if not value:
+        return []
+
+    # Clean-up the string
+    value = value.replace(" ", "")
+    # Escape search operators from LIKE
+    value = value.replace('%', '\\%')
+    # Replace * and ! which are escaped, so that they are not treated as wildcard or NOTs.
+    value = value.replace('\\*', '¤$¤')
+    value = value.replace('\\!', '¤$$¤')
+    value = string_to_none(value)
+
+    # If the clean-up left nothing to search for (empty value or a lone "!"), return an empty list
+    if not value or value == "!":
+        return []
+
+    # Case-sensitive: plain LIKE / NOT LIKE and direct (in)equality.
+    # Case-insensitive: ILIKE / NOT ILIKE, and equality on the lower-cased field and value.
+    if case_sensitive:
+        notlike = lambda x: field.notlike(x, escape="\\")
+        like = lambda x: field.like(x, escape="\\")
+        eq_field = field
+        eq_value = lambda x: x
+    else:
+        notlike = lambda x: field.notilike(x, escape="\\")
+        like = lambda x: field.ilike(x, escape="\\")
+        eq_field = func.lower(field)
+        eq_value = lambda x: x.lower()
+
+    # distinguish LIKE from EQ when wild cards are used
+    if value is not None and "*" in value:
+        # Replace unescaped * as LIKE operator wildcards
+        value = value.replace("*", "%")
+        # Re-introduce previously escaped * (as ¤$¤) as the search character *
+        value = value.replace('¤$¤', '*')
+
+        # Then we check whether the search is negated (leading "!") or not
+        if value.startswith("!") and len(value) > 1:
+            value = value[1:]
+            operator = notlike
         else:
-            # unescape '\*'
-            value = value.replace('¤$¤', '*')
-
-            if value is not None and value.startswith("!") and len(value) > 1:
-                value = value[1:]
-                branch_filters.append(field != value)
-            else:
-                # unescape '\!'
-                value = value.replace('¤$$¤', '!')
-                branch_filters.append(field == value)
+            operator = like
+    else:
+        # Re-introduce previously escaped * (as ¤$¤) as the search character *
+        value = value.replace('¤$¤', '*')
+
+        # Then we check whether the search is negated (leading "!") or not
+        if value.startswith("!") and len(value) > 1:
+            value = value[1:]
+            operator = eq_field.__ne__
+        else:
+            operator = eq_field.__eq__
+
+        value = eq_value(value)
+
+    # Re-introduce previously escaped ! (as ¤$$¤) as the search character !
+    value = value.replace('¤$$¤', '!')
+    branch_filters.append(operator(value))
     return branch_filters
 
 
-def read_input_lemma(values):
-    return [x.replace('\r', '') for x in values.split("\n") if len(x.replace('\r', '').strip()) > 0]
+def read_input_lemma(values: str) -> List[str]:
+    return [
+        x.replace('\r', '')
+        for x in values.split("\n")
+        if len(x.replace('\r', '').strip()) > 0
+    ]
 
 
 def read_input_morph(values):
@@ -124,4 +163,4 @@ def create_input_format_convertion(tokens, allowed_lemma, allowed_morph, allowed
     if tokens:
         tokens = read_input_tokens(tokens)
 
-    return tokens, allowed_lemma, allowed_morph, allowed_POS
+    return tokens, allowed_lemma, allowed_morph, allowed_POS
diff --git a/tests/test_selenium/base.py b/tests/test_selenium/base.py
@@ -706,12 +706,11 @@ def setUp(self):
         db.session.add(new_token)
         db.session.commit()
 
-    def search(self, form="", lemma="", pos="", morph=""):
-
+    def search(self, form="", lemma="", pos="", morph="", case_insensitivity=False):
         self.go_to_search_tokens_page(TokensSearchThroughFieldsBase.CORPUS_ID, as_callback=False)
-
         self.fill_filter_row(form, lemma, pos, morph)
-
+        if case_insensitivity:
+            self.driver_find_element_by_id('caseBox').click()
         self.driver_find_element_by_id("submit_search").click()
 
         result = []

diff --git a/tests/test_selenium/test_tokens_search_through_fields.py b/tests/test_selenium/test_tokens_search_through_fields.py
@@ -1,6 +1,7 @@
 from app.models import WordToken
+from app import db
 from tests.test_selenium.base import TokensSearchThroughFieldsBase
-
+from typing import List, Dict, Set
 
 class TestTokensSearchThroughFields(TokensSearchThroughFieldsBase):
     """ Test searching tokens through fields (Form, Lemma, POS, Morph) within a corpus """
@@ -158,14 +159,42 @@ def test_search_with_negation_and_like_operator(self):
     def test_search_with_or_operator(self):
         # search with OR operator
         rows = self.search(form="seint|seinz|Seinz|seinte")
-        rows_wildcard = self.search(form="sein*")
+        rows_wildcard = self.search(form="sein*", case_insensitivity=True)
+
         rows_lemma = self.search(lemma="saint")
-        self.assertTrue(rows_lemma == rows and rows == rows_wildcard)
+        self.assertEqual(rows_lemma, rows)
+        self.assertEqual(rows_wildcard, rows)
 
-        # test combination with an other field
+        # test combination with another field
         rows = self.search(lemma="m*", pos="NOMcom|NOMpro")
         self.assertTrue(len(rows) == 9)
 
-        # test combination with an other field
+        # test combination with another field
         rows = self.search(form="Martins|mere", lemma="martin|mere")
         self.assertTrue(len(rows) == 3)
+
+    def test_search_with_case_sensitivy(self):
+        """Test that searches are case-sensitive by default and case-insensitive when requested"""
+        # search with and without case sensitivity
+        rows_case_sensitivity_min = self.search(form="de")
+        row_case_sensitivity_maj = self.search(form="De")
+        rows_case_insensitivity = self.search(form="de", case_insensitivity=True)
+
+        def form_only(results: List[Dict[str, str]]) -> Set[str]:
+            return set([line["form"] for line in results])
+
+        self.assertEqual(
+            form_only(rows_case_sensitivity_min), {"de"}, "Min. search should retrieve `de` only")
+        self.assertEqual(
+            form_only(row_case_sensitivity_maj), {"De"}, "Maj search should retrieve `De` only")
+        self.assertEqual(
+            form_only(rows_case_insensitivity), {"De", "de"}, "Insentivity should retrieve both forms")
+
+        seinz_sens = self.search(form="sein*", case_insensitivity=False)
+        seinz_insens = self.search(form="sein*", case_insensitivity=True)
+        self.assertEqual(
+            form_only(seinz_sens), {'seinz', 'seinte', 'seint'},
+            "Sensitivity should retrieve only form in minuscules")
+        self.assertEqual(
+            form_only(seinz_insens), {'seinz', 'seinte', 'seint', 'Seinz'},
+            "Insensitivity should retrieve all forms")